feat: Crawling and parsing of vaccination data

See #2
Adrian Rumpold
2021-08-10 20:06:41 +02:00
parent 8a97e92458
commit f83bb077c1
4 changed files with 111 additions and 33 deletions

View File

@@ -69,7 +69,7 @@ async def get_coordinator(hass: HomeAssistant):
     async def async_get_data() -> IncidenceData:
         crawler = CovidCrawler(hass)
-        return await crawler.crawl()
+        return await crawler.crawl_incidence()

     hass.data[DOMAIN] = DataUpdateCoordinator(
         hass,

View File

@@ -15,6 +15,32 @@ def parse_num(s, t=int):
         return 0


+def parse_date(
+    day: int, month: str, year=datetime.datetime.now().year
+) -> datetime.date:
+    """Parse a German medium-form date, e.g. '17. August', into a datetime.date"""
+    months = [
+        "Januar",
+        "Februar",
+        "März",
+        "April",
+        "Mai",
+        "Juni",
+        "Juli",
+        "August",
+        "September",
+        "Oktober",
+        "November",
+        "Dezember",
+    ]
+    date = datetime.date(
+        year=int(year),
+        month=1 + months.index(month),
+        day=parse_num(day),
+    )
+    return date
+
+
 @dataclass
 class IncidenceData:
     location: str
@@ -26,37 +52,51 @@ class IncidenceData:
     num_dead: int = 0


+@dataclass
+class VaccinationData:
+    date: str
+    total_vaccinations: int = 0
+    num_vaccinated_once: int = 0
+    num_vaccinated_full: int = 0
+    ratio_vaccinated_once: float = 0.0
+    ratio_vaccinated_full: float = 0.0
+
+
 class CovidCrawlerBase(ABC):
     @abstractmethod
-    def crawl(self) -> IncidenceData:
+    def crawl_incidence(self) -> IncidenceData:
         pass

+    @abstractmethod
+    def crawl_vaccination(self) -> VaccinationData:
+        pass
+

 class CovidCrawler(CovidCrawlerBase):
     def __init__(self, hass=None) -> None:
-        self.url = (
-            "https://www.augsburg.de/umwelt-soziales/gesundheit/coronavirus/fallzahlen"
-        )
         self.hass = hass

-    async def crawl(self) -> IncidenceData:
+    async def crawl_incidence(self) -> IncidenceData:
         """
         Fetch COVID-19 infection data from the target website.
         """
         _log.info("Fetching COVID-19 data update")

+        url = (
+            "https://www.augsburg.de/umwelt-soziales/gesundheit/coronavirus/fallzahlen"
+        )
+
         if self.hass:
             from homeassistant.helpers import aiohttp_client

-            result = await aiohttp_client.async_get_clientsession(self.hass).get(
-                self.url
-            )
+            result = await aiohttp_client.async_get_clientsession(self.hass).get(url)
             soup = BeautifulSoup(await result.text(), "html.parser")
         else:
             import requests

-            result = requests.get(self.url)
+            result = requests.get(url)
             if not result.ok:
                 result.raise_for_status()
             soup = BeautifulSoup(result.text, "html.parser")
@@ -79,27 +119,7 @@ class CovidCrawler(CovidCrawlerBase):
         if not matches:
             raise ValueError(f"Could not extract date from scraped web page, {text=}")

-        months = [
-            "Januar",
-            "Februar",
-            "März",
-            "April",
-            "Mai",
-            "Juni",
-            "Juli",
-            "August",
-            "September",
-            "Oktober",
-            "November",
-            "Dezember",
-        ]
-        day = parse_num(matches.group(1))
-        month_name = matches.group(2)
-        date = datetime.date(
-            year=datetime.datetime.now().year,
-            month=1 + months.index(month_name),
-            day=day,
-        )
+        date = parse_date(matches.group(1), matches.group(2))
         _log.debug(f"Parsed date: {date}")

         match = match.find_next_sibling(class_="frame--type-textpic")
@@ -130,3 +150,57 @@ class CovidCrawler(CovidCrawlerBase):
_log.debug(f"Result data: {result}")
return result
async def crawl_vaccination(self) -> VaccinationData:
_log.info("Fetching COVID-19 vaccination data update")
url = "https://www.augsburg.de/umwelt-sozgcoiales/gesundheit/coronavirus/impfzentrum"
container_id = "c1088140"
if self.hass:
from homeassistant.helpers import aiohttp_client
result = await aiohttp_client.async_get_clientsession(self.hass).get(url)
soup = BeautifulSoup(await result.text(), "html.parser")
else:
import requests
result = requests.get(url)
if not result.ok:
result.raise_for_status()
soup = BeautifulSoup(result.text, "html.parser")
result = soup.find(id=container_id)
text = re.sub(r"\s+", " ", result.text)
regexes = [
r"(?P<total_vaccinations>\d+[.]\d+) Impfdosen",
r"Weitere (?P<num_vaccinated_once>\d+[.]\d+) Personen haben die Erstimpfung erhalten",
r"(?P<num_vaccinated_full>\d+[.]\d+) Personen sind bereits vollständig geimpft",
]
values = {}
for r in regexes:
matches = re.search(r, text)
if not matches:
continue
values.update(
{
k: parse_num(v.replace(".", ""))
for k, v in matches.groupdict().items()
}
)
matches = re.search(r"Stand (?P<day>\d+)\. (?P<month>\w+) (?P<year>\d+)", text)
if not matches:
raise ValueError(f"Could not extract date from scraped web page, {text=}")
values["date"] = parse_date(**matches.groupdict()).strftime("%Y-%m-%d")
result = VaccinationData(**values)
# Total population in Augsburg as of 2020
# https://www.augsburg.de/fileadmin/user_upload/buergerservice_rathaus/rathaus/statisiken_und_geodaten/statistiken/Monitoring/Demografiemonitoring_der_Stadt_Augsburg_2021.pdf
population = 299021
result.ratio_vaccinated_full = result.num_vaccinated_full / population
result.ratio_vaccinated_once = result.num_vaccinated_once / population
_log.debug(f"Result data: {result}")
return result
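
For reference, here is how the date and number handling in this file fits together, as a minimal standalone sketch (the sample strings are invented for illustration; the month lookup mirrors parse_date, inlined in simplified form):

    import datetime
    import re

    # Example byline in the shape the "Stand ..." regex above expects
    # ("Stand 10. August 2021" is an assumed sample, not scraped data):
    text = "Stand 10. August 2021"
    m = re.search(r"Stand (?P<day>\d+)\. (?P<month>\w+) (?P<year>\d+)", text)
    print(m.groupdict())  # {'day': '10', 'month': 'August', 'year': '2021'}

    # parse_date(**m.groupdict()) resolves the German month name by list position:
    months = ["Januar", "Februar", "März", "April", "Mai", "Juni",
              "Juli", "August", "September", "Oktober", "November", "Dezember"]
    date = datetime.date(year=int(m["year"]), month=1 + months.index(m["month"]), day=int(m["day"]))
    print(date.strftime("%Y-%m-%d"))  # 2021-08-10

    # Vaccination counts carry German thousands separators ("238.338" is a
    # made-up figure), which crawl_vaccination strips before parse_num:
    print(int("238.338".replace(".", "")))  # 238338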

View File

@@ -3,7 +3,10 @@ from .crawler import CovidCrawler
 async def main():
     crawler = CovidCrawler()
-    result = await crawler.crawl()
+    # result = await crawler.crawl()
+    # print(result)
+    result = await crawler.crawl_vaccination()
     print(result)
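
Since main() is a coroutine, it needs an event loop to run; a minimal way to exercise this manual test module outside Home Assistant (an assumption about the entry-point wiring, which this hunk does not show):

    import asyncio

    if __name__ == "__main__":
        # CovidCrawler() without a hass instance takes the synchronous
        # requests fallback, so this runs standalone (network access required).
        asyncio.run(main())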

View File

@@ -3,5 +3,6 @@
 TODO: Remove once other tests have been added.
 """

+
 def test_example():
-    assert True
\ No newline at end of file
+    assert True