From f83bb077c16533c2a5209c0e059782ba40ae809c Mon Sep 17 00:00:00 2001 From: Adrian Rumpold Date: Tue, 10 Aug 2021 20:06:41 +0200 Subject: [PATCH] feat: Crawling and parsing of vaccination data See #2 --- .../__init__.py | 2 +- .../crawler.py | 134 ++++++++++++++---- .../home_assistant_covid19_augsburg/main.py | 5 +- tests/test_example.py | 3 +- 4 files changed, 111 insertions(+), 33 deletions(-) diff --git a/custom_components/home_assistant_covid19_augsburg/__init__.py b/custom_components/home_assistant_covid19_augsburg/__init__.py index f578184..6263d32 100644 --- a/custom_components/home_assistant_covid19_augsburg/__init__.py +++ b/custom_components/home_assistant_covid19_augsburg/__init__.py @@ -69,7 +69,7 @@ async def get_coordinator(hass: HomeAssistant): async def async_get_data() -> IncidenceData: crawler = CovidCrawler(hass) - return await crawler.crawl() + return await crawler.crawl_incidence() hass.data[DOMAIN] = DataUpdateCoordinator( hass, diff --git a/custom_components/home_assistant_covid19_augsburg/crawler.py b/custom_components/home_assistant_covid19_augsburg/crawler.py index 92dcd1d..5c9a190 100644 --- a/custom_components/home_assistant_covid19_augsburg/crawler.py +++ b/custom_components/home_assistant_covid19_augsburg/crawler.py @@ -15,6 +15,32 @@ def parse_num(s, t=int): return 0 +def parse_date( + day: int, month: str, year=datetime.datetime.now().year +) -> datetime.date: + """Parse a German medium-form date, e.g. 17. 
August into a datetime.date""" + months = [ + "Januar", + "Februar", + "März", + "April", + "Mai", + "Juni", + "Juli", + "August", + "September", + "Oktober", + "November", + "Dezember", + ] + date = datetime.date( + year=int(year), + month=1 + months.index(month), + day=parse_num(day), + ) + return date + + @dataclass class IncidenceData: location: str @@ -26,37 +52,51 @@ class IncidenceData: num_dead: int = 0 +@dataclass +class VaccinationData: + date: str + + total_vaccinations: int = 0 + num_vaccinated_once: int = 0 + num_vaccinated_full: int = 0 + + ratio_vaccinated_once: float = 0.0 + ratio_vaccinated_full: float = 0.0 + + class CovidCrawlerBase(ABC): @abstractmethod - def crawl(self) -> IncidenceData: + def crawl_incidence(self) -> IncidenceData: + pass + + @abstractmethod + def crawl_vaccination(self) -> VaccinationData: pass class CovidCrawler(CovidCrawlerBase): def __init__(self, hass=None) -> None: - self.url = ( - "https://www.augsburg.de/umwelt-soziales/gesundheit/coronavirus/fallzahlen" - ) self.hass = hass - async def crawl(self) -> IncidenceData: + async def crawl_incidence(self) -> IncidenceData: """ Fetch COVID-19 infection data from the target website. 
""" _log.info("Fetching COVID-19 data update") + url = ( + "https://www.augsburg.de/umwelt-soziales/gesundheit/coronavirus/fallzahlen" + ) if self.hass: from homeassistant.helpers import aiohttp_client - result = await aiohttp_client.async_get_clientsession(self.hass).get( - self.url - ) + result = await aiohttp_client.async_get_clientsession(self.hass).get(url) soup = BeautifulSoup(await result.text(), "html.parser") else: import requests - result = requests.get(self.url) + result = requests.get(url) if not result.ok: result.raise_for_status() soup = BeautifulSoup(result.text, "html.parser") @@ -79,27 +119,7 @@ class CovidCrawler(CovidCrawlerBase): if not matches: raise ValueError(f"Could not extract date from scraped web page, {text=}") - months = [ - "Januar", - "Februar", - "März", - "April", - "Mai", - "Juni", - "Juli", - "August", - "September", - "Oktober", - "November", - "Dezember", - ] - day = parse_num(matches.group(1)) - month_name = matches.group(2) - date = datetime.date( - year=datetime.datetime.now().year, - month=1 + months.index(month_name), - day=day, - ) + date = parse_date(matches.group(1), matches.group(2)) _log.debug(f"Parsed date: {date}") match = match.find_next_sibling(class_="frame--type-textpic") @@ -130,3 +150,57 @@ class CovidCrawler(CovidCrawlerBase): _log.debug(f"Result data: {result}") return result + + async def crawl_vaccination(self) -> VaccinationData: + _log.info("Fetching COVID-19 vaccination data update") + url = "https://www.augsburg.de/umwelt-sozgcoiales/gesundheit/coronavirus/impfzentrum" + container_id = "c1088140" + + if self.hass: + from homeassistant.helpers import aiohttp_client + + result = await aiohttp_client.async_get_clientsession(self.hass).get(url) + soup = BeautifulSoup(await result.text(), "html.parser") + else: + import requests + + result = requests.get(url) + if not result.ok: + result.raise_for_status() + soup = BeautifulSoup(result.text, "html.parser") + + result = soup.find(id=container_id) + text = 
re.sub(r"\s+", " ", result.text) + regexes = [ + r"(?P<total_vaccinations>\d+[.]\d+) Impfdosen", + r"Weitere (?P<num_vaccinated_once>\d+[.]\d+) Personen haben die Erstimpfung erhalten", + r"(?P<num_vaccinated_full>\d+[.]\d+) Personen sind bereits vollständig geimpft", + ] + values = {} + for r in regexes: + matches = re.search(r, text) + if not matches: + continue + values.update( + { + k: parse_num(v.replace(".", "")) + for k, v in matches.groupdict().items() + } + ) + + matches = re.search(r"Stand (?P<day>\d+)\. (?P<month>\w+) (?P<year>\d+)", text) + if not matches: + raise ValueError(f"Could not extract date from scraped web page, {text=}") + + values["date"] = parse_date(**matches.groupdict()).strftime("%Y-%m-%d") + result = VaccinationData(**values) + + # Total population in Augsburg as of 2020 + # https://www.augsburg.de/fileadmin/user_upload/buergerservice_rathaus/rathaus/statisiken_und_geodaten/statistiken/Monitoring/Demografiemonitoring_der_Stadt_Augsburg_2021.pdf + population = 299021 + + result.ratio_vaccinated_full = result.num_vaccinated_full / population + result.ratio_vaccinated_once = result.num_vaccinated_once / population + _log.debug(f"Result data: {result}") + + return result diff --git a/custom_components/home_assistant_covid19_augsburg/main.py b/custom_components/home_assistant_covid19_augsburg/main.py index 422e50e..4783013 100644 --- a/custom_components/home_assistant_covid19_augsburg/main.py +++ b/custom_components/home_assistant_covid19_augsburg/main.py @@ -3,7 +3,10 @@ from .crawler import CovidCrawler async def main(): crawler = CovidCrawler() - result = await crawler.crawl() + # result = await crawler.crawl() + # print(result) + + result = await crawler.crawl_vaccination() print(result) diff --git a/tests/test_example.py b/tests/test_example.py index b2f43a8..e08bf7d 100644 --- a/tests/test_example.py +++ b/tests/test_example.py @@ -3,5 +3,6 @@ TODO: Remove once other tests have been added. """ + def test_example(): - assert True + assert True