feat: Crawling and parsing of vaccination data

See #2
Adrian Rumpold
2021-08-10 20:06:41 +02:00
parent 8a97e92458
commit f83bb077c1
4 changed files with 111 additions and 33 deletions

View File

@@ -69,7 +69,7 @@ async def get_coordinator(hass: HomeAssistant):
     async def async_get_data() -> IncidenceData:
         crawler = CovidCrawler(hass)
-        return await crawler.crawl()
+        return await crawler.crawl_incidence()

     hass.data[DOMAIN] = DataUpdateCoordinator(
         hass,

View File

@@ -15,6 +15,32 @@ def parse_num(s, t=int):
         return 0


+def parse_date(
+    day: int, month: str, year=datetime.datetime.now().year
+) -> datetime.date:
+    """Parse a German medium-form date, e.g. '17. August', into a datetime.date"""
+    months = [
+        "Januar",
+        "Februar",
+        "März",
+        "April",
+        "Mai",
+        "Juni",
+        "Juli",
+        "August",
+        "September",
+        "Oktober",
+        "November",
+        "Dezember",
+    ]
+    date = datetime.date(
+        year=int(year),
+        month=1 + months.index(month),
+        day=parse_num(day),
+    )
+    return date
+
+
 @dataclass
 class IncidenceData:
     location: str
@@ -26,37 +52,51 @@ class IncidenceData:
     num_dead: int = 0


+@dataclass
+class VaccinationData:
+    date: str
+    total_vaccinations: int = 0
+    num_vaccinated_once: int = 0
+    num_vaccinated_full: int = 0
+    ratio_vaccinated_once: float = 0.0
+    ratio_vaccinated_full: float = 0.0
+
+
 class CovidCrawlerBase(ABC):
     @abstractmethod
-    def crawl(self) -> IncidenceData:
+    def crawl_incidence(self) -> IncidenceData:
         pass

+    @abstractmethod
+    def crawl_vaccination(self) -> VaccinationData:
+        pass
+

 class CovidCrawler(CovidCrawlerBase):
     def __init__(self, hass=None) -> None:
-        self.url = (
-            "https://www.augsburg.de/umwelt-soziales/gesundheit/coronavirus/fallzahlen"
-        )
         self.hass = hass

-    async def crawl(self) -> IncidenceData:
+    async def crawl_incidence(self) -> IncidenceData:
         """
         Fetch COVID-19 infection data from the target website.
         """
         _log.info("Fetching COVID-19 data update")

+        url = (
+            "https://www.augsburg.de/umwelt-soziales/gesundheit/coronavirus/fallzahlen"
+        )
+
         if self.hass:
             from homeassistant.helpers import aiohttp_client

-            result = await aiohttp_client.async_get_clientsession(self.hass).get(
-                self.url
-            )
+            result = await aiohttp_client.async_get_clientsession(self.hass).get(url)
             soup = BeautifulSoup(await result.text(), "html.parser")
         else:
             import requests

-            result = requests.get(self.url)
+            result = requests.get(url)
             if not result.ok:
                 result.raise_for_status()
             soup = BeautifulSoup(result.text, "html.parser")
@@ -79,27 +119,7 @@ class CovidCrawler(CovidCrawlerBase):
         if not matches:
             raise ValueError(f"Could not extract date from scraped web page, {text=}")

-        months = [
-            "Januar",
-            "Februar",
-            "März",
-            "April",
-            "Mai",
-            "Juni",
-            "Juli",
-            "August",
-            "September",
-            "Oktober",
-            "November",
-            "Dezember",
-        ]
-        day = parse_num(matches.group(1))
-        month_name = matches.group(2)
-        date = datetime.date(
-            year=datetime.datetime.now().year,
-            month=1 + months.index(month_name),
-            day=day,
-        )
+        date = parse_date(matches.group(1), matches.group(2))
         _log.debug(f"Parsed date: {date}")

         match = match.find_next_sibling(class_="frame--type-textpic")
@@ -130,3 +150,57 @@ class CovidCrawler(CovidCrawlerBase):
_log.debug(f"Result data: {result}")
return result
async def crawl_vaccination(self) -> VaccinationData:
_log.info("Fetching COVID-19 vaccination data update")
url = "https://www.augsburg.de/umwelt-sozgcoiales/gesundheit/coronavirus/impfzentrum"
container_id = "c1088140"
if self.hass:
from homeassistant.helpers import aiohttp_client
result = await aiohttp_client.async_get_clientsession(self.hass).get(url)
soup = BeautifulSoup(await result.text(), "html.parser")
else:
import requests
result = requests.get(url)
if not result.ok:
result.raise_for_status()
soup = BeautifulSoup(result.text, "html.parser")
result = soup.find(id=container_id)
text = re.sub(r"\s+", " ", result.text)
regexes = [
r"(?P<total_vaccinations>\d+[.]\d+) Impfdosen",
r"Weitere (?P<num_vaccinated_once>\d+[.]\d+) Personen haben die Erstimpfung erhalten",
r"(?P<num_vaccinated_full>\d+[.]\d+) Personen sind bereits vollständig geimpft",
]
values = {}
for r in regexes:
matches = re.search(r, text)
if not matches:
continue
values.update(
{
k: parse_num(v.replace(".", ""))
for k, v in matches.groupdict().items()
}
)
matches = re.search(r"Stand (?P<day>\d+)\. (?P<month>\w+) (?P<year>\d+)", text)
if not matches:
raise ValueError(f"Could not extract date from scraped web page, {text=}")
values["date"] = parse_date(**matches.groupdict()).strftime("%Y-%m-%d")
result = VaccinationData(**values)
# Total population in Augsburg as of 2020
# https://www.augsburg.de/fileadmin/user_upload/buergerservice_rathaus/rathaus/statisiken_und_geodaten/statistiken/Monitoring/Demografiemonitoring_der_Stadt_Augsburg_2021.pdf
population = 299021
result.ratio_vaccinated_full = result.num_vaccinated_full / population
result.ratio_vaccinated_once = result.num_vaccinated_once / population
_log.debug(f"Result data: {result}")
return result
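
For reference, here is how the date and number handling in this file fits together, as a minimal standalone sketch (the sample strings are invented for illustration; the month lookup mirrors parse_date, inlined in simplified form):

    import datetime
    import re

    # Example byline in the shape the "Stand ..." regex above expects
    # ("Stand 10. August 2021" is an assumed sample, not scraped data):
    text = "Stand 10. August 2021"
    m = re.search(r"Stand (?P<day>\d+)\. (?P<month>\w+) (?P<year>\d+)", text)
    print(m.groupdict())  # {'day': '10', 'month': 'August', 'year': '2021'}

    # parse_date(**m.groupdict()) resolves the German month name by list position:
    months = ["Januar", "Februar", "März", "April", "Mai", "Juni",
              "Juli", "August", "September", "Oktober", "November", "Dezember"]
    date = datetime.date(year=int(m["year"]), month=1 + months.index(m["month"]), day=int(m["day"]))
    print(date.strftime("%Y-%m-%d"))  # 2021-08-10

    # Vaccination counts carry German thousands separators ("238.338" is a
    # made-up figure), which crawl_vaccination strips before parse_num:
    print(int("238.338".replace(".", "")))  # 238338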

View File

@@ -3,7 +3,10 @@ from .crawler import CovidCrawler
 async def main():
     crawler = CovidCrawler()
-    result = await crawler.crawl()
+    # result = await crawler.crawl()
+    # print(result)
+    result = await crawler.crawl_vaccination()
     print(result)
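
Since main() is a coroutine, it needs an event loop to run; a minimal way to exercise this manual test module outside Home Assistant (an assumption about the entry-point wiring, which this hunk does not show):

    import asyncio

    if __name__ == "__main__":
        # CovidCrawler() without a hass instance takes the synchronous
        # requests fallback, so this runs standalone (network access required).
        asyncio.run(main())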

View File

@@ -3,5 +3,6 @@
 TODO: Remove once other tests have been added.
 """

+
 def test_example():
-    assert True
\ No newline at end of file
+    assert True