home-assistant-covid19-augs…/custom_components/home_assistant_covid19_augsburg/crawler.py

import datetime
import logging
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass

from bs4 import BeautifulSoup

_log = logging.getLogger(__name__)


def parse_num(s, t=int):
    if len(s):
        return t(s.replace(".", "").replace(",", "."))
    return 0


@dataclass
class IncidenceData:
    location: str
    date: str
    incidence: float
    total_cases: int = 0
    num_infected: int = 0
    num_recovered: int = 0
    num_dead: int = 0


class CovidCrawlerBase(ABC):
    @abstractmethod
    def crawl(self) -> IncidenceData:
        pass


class CovidCrawler(CovidCrawlerBase):
    def __init__(self, hass=None) -> None:
        self.url = (
            "https://www.augsburg.de/umwelt-soziales/gesundheit/coronavirus/fallzahlen"
        )
        self.hass = hass

    async def crawl(self) -> IncidenceData:
        """
        Fetch COVID-19 infection data from the target website.
        """

        _log.info("Fetching COVID-19 data update")

        if self.hass:
            from homeassistant.helpers import aiohttp_client

            result = await aiohttp_client.async_get_clientsession(self.hass).get(
                self.url
            )
            soup = BeautifulSoup(await result.text(), "html.parser")
        else:
            import requests

            result = requests.get(self.url)
            if not result.ok:
                result.raise_for_status()
            soup = BeautifulSoup(result.text, "html.parser")

        match = soup.find(class_="frame--type-textpic")
        text = match.p.text
        _log.debug(f"Infection data text: {text}")

        matches = re.search(r"(\d+,\d+)\sNeuinfektion", text)
        if not matches:
            raise ValueError(
                f"Could not extract incidence from scraped web page, {text=}"
            )

        incidence = parse_num(matches.group(1), t=float)
        _log.debug(f"Parsed incidence: {incidence}")

        text = match.h2.text
        matches = re.search(r"\((\d+)\. (\w+).*\)", text)
        if not matches:
            raise ValueError(f"Could not extract date from scraped web page, {text=}")

        months = [
            "Januar",
            "Februar",
            "März",
            "April",
            "Mai",
            "Juni",
            "Juli",
            "August",
            "September",
            "Oktober",
            "November",
            "Dezember",
        ]
        day = parse_num(matches.group(1))
        month_name = matches.group(2)
        date = datetime.date(
            year=datetime.datetime.now().year,
            month=1 + months.index(month_name),
            day=day,
        )
        _log.debug(f"Parsed date: {date}")

        match = match.find_next_sibling(class_="frame--type-textpic")
        text = match.text
        _log.debug(f"Infection counts text: {text}")

        regexes = [
            r"Insgesamt: (?P<total_cases>[0-9.]+)",
            r"genesen: (?P<num_recovered>[0-9.]+)",
            r"infiziert: (?P<num_infected>[0-9.]+)",
            r"verstorben: (?P<num_dead>[0-9.]+)",
        ]
        cases = {}
        for r in regexes:
            matches = re.search(r, text)
            if not matches:
                continue
            cases.update(
                {
                    k: parse_num(v.replace(".", ""))
                    for k, v in matches.groupdict().items()
                }
            )

        result = IncidenceData(
            "Augsburg", incidence=incidence, date=date.strftime("%Y-%m-%d"), **cases
        )
        _log.debug(f"Result data: {result}")

        return result