Initial commit

2021-06-18 14:11:42 +02:00
commit 3c4768eab8
12 changed files with 1363 additions and 0 deletions
--- a/custom_components/home_assistant_covid19_augsburg/crawler.py
+++ b/custom_components/home_assistant_covid19_augsburg/crawler.py
@@ -0,0 +1,94 @@
+import datetime
+import locale
+import logging
+import re
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+import requests
+from bs4 import BeautifulSoup
+
+_log = logging.getLogger(__name__)
+
+
+@dataclass
+class IncidenceData:
+    location: str
+    date: datetime.date
+    incidence: float
+    total_cases: int = 0
+    num_infected: int = 0
+    num_recovered: int = 0
+    num_dead: int = 0
+
+
+class CovidCrawlerBase(ABC):
+    @abstractmethod
+    def crawl(self) -> IncidenceData:
+        pass
+
+
+class CovidCrawler(CovidCrawlerBase):
+    def __init__(self) -> None:
+        self.url = (
+            "https://www.augsburg.de/umwelt-soziales/gesundheit/coronavirus/fallzahlen"
+        )
+
+    def crawl(self) -> IncidenceData:
+        """
+        Fetch COVID-19 infection data from the target website.
+        """
+
+        _log.info("Fetching COVID-19 data update")
+
+        locale.setlocale(locale.LC_ALL, "de_DE.utf8")
+
+        result = requests.get(self.url)
+        if not result.ok:
+            result.raise_for_status()
+
+        soup = BeautifulSoup(result.text, features="html.parser")
+
+        match = soup.find(class_="frame--type-textpic")
+        text = match.p.text
+        _log.debug(f"Infection data text: {text}")
+
+        matches = re.search(r"(\d+,\d+) Neuinfektion", text)
+        if not matches:
+            raise ValueError("Could not extract incidence from scraped web page")
+
+        incidence = locale.atof(matches.group(1))
+        _log.debug(f"Parsed incidence: {incidence}")
+
+        text = match.h2.text
+        matches = re.search(r"\((\d+\. \w+)\)", text)
+        if not matches:
+            raise ValueError("Could not extract date from scraped web page")
+
+        date = datetime.datetime.strptime(matches.group(1), "%d. %B")
+        date = date.replace(year=datetime.datetime.now().year).date()
+        _log.debug(f"Parsed date: {date}")
+
+        match = match.find_next_sibling(class_="frame--type-textpic")
+        text = match.text
+        _log.debug(f"Infection counts text: {text}")
+
+        regexes = [
+            r"Insgesamt: (?P<total_cases>[0-9.]+)",
+            r"genesen: (?P<num_recovered>[0-9.]+)",
+            r"infiziert: (?P<num_infected>[0-9.]+)",
+            r"verstorben: (?P<num_dead>[0-9.]+)",
+        ]
+        cases = {}
+        for r in regexes:
+            matches = re.search(r, text)
+            if not matches:
+                continue
+            cases.update(
+                {k: int(v.replace(".", "")) for k, v in matches.groupdict().items()}
+            )
+
+        result = IncidenceData("Augsburg", incidence, date, **cases)
+        _log.debug(f"Result data: {result}")
+
+        return result