Files
home-assistant-covid19-augs…/custom_components/home_assistant_covid19_augsburg/crawler.py
Adrian Rumpold 3c4768eab8 Initial commit
2021-06-18 14:12:00 +02:00

95 lines
2.6 KiB
Python

import datetime
import locale
import logging
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
import requests
from bs4 import BeautifulSoup
# Module-level logger named after this module; used by the crawler below for
# its info/debug progress messages.
_log = logging.getLogger(__name__)
@dataclass
class IncidenceData:
    """COVID-19 figures for one location on one reporting date.

    The field order below defines the positional parameter order of the
    generated ``__init__``: ``location``, ``date``, ``incidence``, followed by
    the optional case counts, which all default to ``0``.
    """

    # Name of the place the figures belong to (the crawler in this module
    # passes "Augsburg").
    location: str
    # Reporting date of the figures.
    date: datetime.date
    # Incidence value scraped from the page text.
    # NOTE(review): presumably the 7-day incidence per 100,000 inhabitants - confirm.
    incidence: float
    # Count scraped from the "Insgesamt" (total) figure; 0 if not found.
    total_cases: int = 0
    # Count scraped from the "infiziert" (infected) figure; 0 if not found.
    num_infected: int = 0
    # Count scraped from the "genesen" (recovered) figure; 0 if not found.
    num_recovered: int = 0
    # Count scraped from the "verstorben" (deceased) figure; 0 if not found.
    num_dead: int = 0
class CovidCrawlerBase(ABC):
    """Abstract interface for objects that can fetch COVID-19 incidence data.

    Concrete crawlers must override :meth:`crawl`; the class itself cannot be
    instantiated because ``crawl`` is abstract.
    """

    @abstractmethod
    def crawl(self) -> IncidenceData:
        """Fetch the current figures and return them as an ``IncidenceData``."""
class CovidCrawler(CovidCrawlerBase):
    """Crawler that scrapes COVID-19 figures from the City of Augsburg web page.

    The page is fetched over HTTP and parsed with BeautifulSoup. The first
    element with CSS class ``frame--type-textpic`` supplies the incidence (from
    its first ``<p>``) and the reporting date (from its ``<h2>``); the next
    sibling element with the same class supplies the optional case counts.
    """

    # Seconds to wait for the web server. Without an explicit timeout
    # ``requests`` may block indefinitely on an unresponsive host.
    _REQUEST_TIMEOUT = 30

    # German month names as written in the page headline, mapped to month
    # numbers. Parsing them explicitly keeps the crawler independent of a
    # ``de_DE`` locale being installed and avoids changing the process-wide
    # locale on every crawl.
    _GERMAN_MONTHS = {
        "januar": 1,
        "februar": 2,
        "märz": 3,
        "april": 4,
        "mai": 5,
        "juni": 6,
        "juli": 7,
        "august": 8,
        "september": 9,
        "oktober": 10,
        "november": 11,
        "dezember": 12,
    }

    # One pattern per optional count; each named group matches the
    # ``IncidenceData`` field it fills.
    _CASE_PATTERNS = (
        r"Insgesamt: (?P<total_cases>[0-9.]+)",
        r"genesen: (?P<num_recovered>[0-9.]+)",
        r"infiziert: (?P<num_infected>[0-9.]+)",
        r"verstorben: (?P<num_dead>[0-9.]+)",
    )

    def __init__(self) -> None:
        self.url = (
            "https://www.augsburg.de/umwelt-soziales/gesundheit/coronavirus/fallzahlen"
        )

    def crawl(self) -> IncidenceData:
        """
        Fetch COVID-19 infection data from the target website.

        Returns:
            An ``IncidenceData`` for location ``"Augsburg"`` holding the parsed
            reporting date, the incidence and whichever case counts were found
            (counts that are missing from the page keep their default of 0).

        Raises:
            requests.HTTPError: If the server answers with an error status.
                Connection errors and timeouts raised by ``requests`` propagate
                unchanged.
            ValueError: If the expected page sections, the incidence or the
                date cannot be found in the page.
        """
        _log.info("Fetching COVID-19 data update")

        response = requests.get(self.url, timeout=self._REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, features="html.parser")

        summary = soup.find(class_="frame--type-textpic")
        if summary is None or summary.p is None or summary.h2 is None:
            raise ValueError("Could not locate incidence section in scraped web page")

        text = summary.p.text
        _log.debug("Infection data text: %s", text)
        incidence = self._parse_incidence(text)
        _log.debug("Parsed incidence: %s", incidence)

        date = self._parse_date(summary.h2.text)
        _log.debug("Parsed date: %s", date)

        counts_section = summary.find_next_sibling(class_="frame--type-textpic")
        if counts_section is None:
            raise ValueError(
                "Could not locate infection counts section in scraped web page"
            )
        text = counts_section.text
        _log.debug("Infection counts text: %s", text)
        cases = self._parse_cases(text)

        # Keyword arguments on purpose: the dataclass declares ``date`` before
        # ``incidence``, so passing the two positionally is easy to get wrong.
        result = IncidenceData(
            location="Augsburg", date=date, incidence=incidence, **cases
        )
        _log.debug("Result data: %s", result)
        return result

    @staticmethod
    def _parse_incidence(text: str) -> float:
        """Return the incidence written with a decimal comma before "Neuinfektion"."""
        found = re.search(r"(\d+,\d+) Neuinfektion", text)
        if not found:
            raise ValueError("Could not extract incidence from scraped web page")
        # The match is digits, one comma, digits; swap the German decimal comma
        # for a point so ``float`` can parse it.
        return float(found.group(1).replace(",", "."))

    @classmethod
    def _parse_date(cls, text: str) -> datetime.date:
        """Return the reporting date from a headline containing e.g. ``(18. Juni)``."""
        found = re.search(r"\((\d+\. \w+)\)", text)
        if not found:
            raise ValueError("Could not extract date from scraped web page")

        day_text, _, month_name = found.group(1).partition(". ")
        month = cls._GERMAN_MONTHS.get(month_name.lower())
        if month is None:
            raise ValueError("Could not extract date from scraped web page")
        day = int(day_text)

        # The page omits the year. Assume the current year, but fall back to
        # the previous one when that would give a date in the future (e.g.
        # "31. Dezember" read on 1 January) or an invalid date (29 February).
        # One day of slack tolerates a host clock in a time zone behind the
        # page's.
        today = datetime.date.today()
        latest_plausible = today + datetime.timedelta(days=1)
        for year in (today.year, today.year - 1):
            try:
                candidate = datetime.date(year, month, day)
            except ValueError:
                continue
            if candidate <= latest_plausible:
                return candidate
        raise ValueError("Could not extract date from scraped web page")

    @classmethod
    def _parse_cases(cls, text: str) -> dict:
        """Return the case counts found in ``text``, keyed by ``IncidenceData`` field."""
        cases = {}
        for pattern in cls._CASE_PATTERNS:
            found = re.search(pattern, text)
            if not found:
                # Counts are optional; a missing one keeps the dataclass default.
                continue
            for key, raw in found.groupdict().items():
                # Dots are German thousands separators ("1.234" -> 1234).
                cases[key] = int(raw.replace(".", ""))
        return cases