import io
import logging
import typing
import zipfile
from dataclasses import dataclass
from enum import StrEnum
from pathlib import Path

import httpx
from lxml import etree

from formex_viewer import formex4


class SystemName(StrEnum):
    """Identifier systems; the value is the first path segment of a resource URL."""

    CELEX = "celex"
    CELLAR = "cellar"
    OJ = "oj"


@dataclass
class CellarIdentifier:
    """A resource identifier: the identifier system plus the ID within it."""

    system_name: SystemName
    system_id: str


class Language(StrEnum):
    """Language enum for Cellar"""

    ENG = "eng"
    DEU = "deu"
    FRA = "fra"
    ITA = "ita"


class ContentType(StrEnum):
    """Values sent in the ``Accept`` header when fetching a publication."""

    XML_FMX4 = "application/xml;mtype=fmx4"
    ZIP_FMX4 = "application/zip;mtype=fmx4"
    XHTML_XML = "application/xhtml+xml"


@dataclass
class Metadata:
    """Subset of a metadata notice needed to locate the publication text."""

    # Identifier taken from the first NOTICE/EXPRESSION/URI element.
    publication_text: CellarIdentifier

    @classmethod
    def from_xml(cls, xmlstr: str) -> typing.Self:
        """Parse XML metadata.

        Args:
            xmlstr: The metadata notice as an XML string.

        Returns:
            A ``Metadata`` whose ``publication_text`` is built from the
            ``TYPE`` and ``IDENTIFIER`` children of the first
            ``//NOTICE/EXPRESSION/URI`` element.

        Raises:
            ValueError: If no such ``URI`` element exists, if its ``TYPE`` or
                ``IDENTIFIER`` child is missing or empty, or if ``TYPE`` is
                not a known ``SystemName``.
        """
        tree = etree.fromstring(xmlstr.encode("utf-8"))
        uris = tree.xpath("//NOTICE/EXPRESSION/URI")
        if not uris:
            raise ValueError(
                "No NOTICE/EXPRESSION/URI element found in metadata notice"
            )
        uri = uris[0]

        # findtext() returns None for a missing child instead of raising
        # AttributeError on `.text`, so both cases are reported uniformly.
        system_name = uri.findtext("TYPE")
        system_id = uri.findtext("IDENTIFIER")
        if not system_name or not system_id:
            raise ValueError(
                "URI element in metadata notice lacks TYPE or IDENTIFIER"
            )

        return cls(
            publication_text=CellarIdentifier(
                system_name=SystemName(system_name),
                system_id=system_id,
            )
        )


def extract_fmx_main_publication(zip_data: typing.BinaryIO) -> str:
    """Find and return the main document in a Formex 4 ZIP archive.

    Algorithm:
      1. Find the XML file containing the document descriptor
         (``*.doc.fmx.xml`` or ``*.doc.xml``).
      2. Parse it to find the main publication reference
         (XPath ``/DOC/FMX/DOC.MAIN.PUB/REF.PHYS[@TYPE="DOC.XML"]``).
      3. Extract the archive member named by that reference's ``FILE``
         attribute.

    Args:
        zip_data: Binary file-like object containing the ZIP archive.

    Returns:
        The main publication file decoded as UTF-8.

    Raises:
        ValueError: If the descriptor, the main publication reference or its
            ``FILE`` attribute cannot be found.
        KeyError: If the referenced file is not a member of the archive.
    """
    with zipfile.ZipFile(zip_data) as z:
        # Find the document descriptor XML file
        doc_xml_files = [
            f for f in z.namelist() if f.endswith((".doc.fmx.xml", ".doc.xml"))
        ]
        if not doc_xml_files:
            logging.info("ZIP file contents: %s", z.namelist())
            raise ValueError(
                "No document descriptor (*.doc.xml / *.doc.fmx.xml) found in the archive"
            )
        doc_xml_file = doc_xml_files[0]

        # Parse the XML file
        with z.open(doc_xml_file) as f:
            tree = etree.parse(f)

        # Find the main publication reference
        main_pub_ref = tree.xpath('/DOC/FMX/DOC.MAIN.PUB/REF.PHYS[@TYPE="DOC.XML"]')
        if not main_pub_ref:
            raise ValueError(
                "Main publication reference not found in document descriptor"
            )

        # Get the FILE attribute
        main_file = main_pub_ref[0].get("FILE")
        if not main_file:
            raise ValueError("FILE attribute not found in main publication reference")

        # Extract the file content
        with z.open(main_file) as f:
            return f.read().decode("utf-8")


class CellarClient:
    """HTTP client for fetching metadata and publication texts from Cellar.

    The client owns an ``httpx.Client``; call ``close()`` when done or use the
    instance as a context manager.
    """

    def __init__(self, language: Language = Language.ENG):
        """Create a client that requests content in ``language``."""
        self._client = httpx.Client(
            base_url="http://publications.europa.eu", follow_redirects=True
        )
        self.language = language

    def close(self) -> None:
        """Close the underlying HTTP client and release its connections."""
        self._client.close()

    def __enter__(self) -> typing.Self:
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()

    def metadata(self, cellar_id: CellarIdentifier) -> Metadata:
        """Fetch metadata from Cellar.

        Raises:
            httpx.HTTPStatusError: If the server answers with an error status.
            ValueError: If the returned notice cannot be interpreted.
        """
        resp = self._client.get(
            f"/resource/{cellar_id.system_name}/{cellar_id.system_id}",
            headers={
                "Accept": "application/xml;notice=object",
                "Accept-Language": self.language,
            },
        )
        resp.raise_for_status()
        return Metadata.from_xml(resp.text)

    def publication_text(
        self,
        cellar_id: CellarIdentifier,
        content_type: ContentType,
    ) -> str:
        """Fetch a publication from Cellar.

        The metadata notice for ``cellar_id`` is fetched first; the identifier
        it yields is then requested with ``content_type`` as the ``Accept``
        header.

        Returns:
            The response body as text, or, when the response ``Content-Type``
            contains ``zip``, the main Formex 4 document extracted from the
            archive.

        Raises:
            httpx.HTTPStatusError: If either request returns an error status.
            ValueError: If the metadata or the ZIP archive is malformed.
        """
        metadata = self.metadata(cellar_id)
        identifier = metadata.publication_text
        resp = self._client.get(
            f"/resource/{identifier.system_name}/{identifier.system_id}",
            headers={
                "Accept": content_type,
                "Accept-Language": self.language,
            },
        )
        resp.raise_for_status()

        if "zip" in resp.headers.get("Content-Type", ""):
            return extract_fmx_main_publication(io.BytesIO(resp.content))
        return resp.text


def _get_path(
    base_path: Path,
    typ: typing.Literal["recital", "article", "annex"],
    num: str,
    extension: str,
) -> Path:
    """Get the path for a given type and number.

    The result is ``<base_path>/<plural of typ>/<num><extension>``.
    """
    plurals = {
        "recital": "recitals",
        "article": "articles",
        "annex": "annexes",
    }
    # Append the extension rather than using Path.with_suffix(): with_suffix()
    # would replace everything after the last dot in `num` ("1.2" -> "1.html").
    return base_path / plurals[typ] / f"{num}{extension}"


class HtmlConverter:
    """Convert a Formex 4 XML document to HTML files"""

    extension = ".html"

    def __init__(self, fmx4_content: str, outdir: Path):
        """Parse ``fmx4_content`` and create ``outdir`` if it does not exist."""
        self._fmx4 = fmx4_content
        self._xml = etree.fromstring(self._fmx4.encode("utf-8"))
        self._outdir = outdir
        self._outdir.mkdir(parents=True, exist_ok=True)

    def convert(self) -> None:
        """Split the publication text into separate files.

        Structure:
        - recitals/: All recitals, one per file
        - articles/: All articles, one per file

        Recitals without a number or text and articles without an
        ``IDENTIFIER`` attribute are logged and skipped.
        """
        # Extract recitals
        for recital in self._xml.xpath("//GR.CONSID/CONSID/NP"):
            num_str = recital.findtext("NO.P")
            if not num_str:
                logging.warning("Skipping recital without a number (NO.P)")
                continue
            # Recital numbers are written like "(12)"; drop the parentheses.
            num = num_str.strip("()")
            filename = _get_path(self._outdir, "recital", num, self.extension)
            filename.parent.mkdir(parents=True, exist_ok=True)

            txt = formex4.text_content(recital.find("TXT"))
            if txt is None:
                logging.warning("Recital %s has no text", num)
                continue
            filename.write_text(txt, encoding="utf-8")

        # Extract articles
        for article in self._xml.xpath("//ARTICLE"):
            identifier = article.get("IDENTIFIER")
            if identifier is None:
                logging.warning("Skipping article without an IDENTIFIER attribute")
                continue
            # Strip leading zeros from the identifier for the file name.
            num = identifier.lstrip("0")
            filename = _get_path(self._outdir, "article", num, self.extension)
            filename.parent.mkdir(parents=True, exist_ok=True)

            txt = formex4.FormexArticleConverter().convert_article(article)
            filename.write_text(txt, encoding="utf-8")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Cyber Resilience Act - CELEX 32024R2847
    cra_id = CellarIdentifier(system_name=SystemName.CELEX, system_id="32024R2847")

    # AI Act - CELEX 32024R1689
    aia_id = CellarIdentifier(system_name=SystemName.CELEX, system_id="32024R1689")

    content_type = ContentType.ZIP_FMX4
    language = Language.ENG

    tmpdir = Path("tmp")
    tmpdir.mkdir(parents=True, exist_ok=True)

    # The language is configured on the client; publication_text() takes only
    # the identifier and the content type.
    with CellarClient(language=language) as client:
        fmx4_text = client.publication_text(cra_id, content_type)

    converter = HtmlConverter(fmx4_text, tmpdir)
    converter.convert()