Working initial version

2025-04-23 11:30:03 +02:00
commit 54a3aba531
34 changed files with 5583 additions and 0 deletions
--- a/src/formex_viewer/init.py
+++ b/src/formex_viewer/init.py
@@ -0,0 +1,2 @@
+def main() -> None:
+    """Print the package greeting to standard output."""
+    greeting = "Hello from formex-viewer!"
+    print(greeting)
--- a/src/formex_viewer/formex4.py
+++ b/src/formex_viewer/formex4.py
@@ -0,0 +1,367 @@
+import html
+import re
+from typing import Optional, Union
+
+import lxml.etree
+from lxml import etree as ET
+
+
+def text_content(el: "lxml.etree._Element") -> str:
+    """Return the concatenated text of ``el`` and all of its descendants.
+
+    Text is collected in document order: an element's own text first, then
+    each child's content followed by that child's tail. The tail of ``el``
+    itself is not included because it belongs to the parent's content.
+
+    Args:
+        el: The element whose text should be extracted.
+
+    Returns:
+        The flattened text, or an empty string if the element has none.
+    """
+
+    def _iterate(node):
+        if node.text:
+            yield node.text
+        for child in node:
+            # lxml exposes comments and processing instructions as children
+            # whose ``tag`` is not a string; their ``text`` is markup rather
+            # than content, so only the tail that follows them is kept.
+            if isinstance(child.tag, str):
+                yield from _iterate(child)
+            if child.tail:
+                yield child.tail
+
+    return "".join(_iterate(el))
+
+
+class FormexArticleConverter:
+    """Converts Formex XML <ARTICLE> elements to semantic HTML5.
+
+    All text and attribute values taken from the XML are HTML-escaped before
+    they are placed in the output. Child elements are looked up with
+    ``find``/``findall`` (which accept ``{namespace}TAG`` names) so the
+    converter also works when a namespace is configured.
+    """
+
+    # Formex HT TYPE attribute -> (opening markup, closing markup).
+    _HT_MARKUP = {
+        "BOLD": ("<strong>", "</strong>"),
+        "ITALIC": ("<em>", "</em>"),
+        "SUB": ("<sub>", "</sub>"),
+        "SUP": ("<sup>", "</sup>"),
+        "UNDERLINE": ("<u>", "</u>"),
+        "SC": ('<span style="font-variant: small-caps">', "</span>"),
+    }
+
+    # Formex LIST TYPE attribute -> suffix of the emitted "list-..." CSS class.
+    # Values must be plain class-name fragments (no quotes) because they are
+    # interpolated into a class attribute.
+    _LIST_STYLES = {
+        "ARAB": "decimal",
+        "ALPHA": "upper-alpha",
+        "alpha": "lower-alpha",
+        "ROMAN": "upper-roman",
+        "roman": "lower-roman",
+        "BULLET": "disc",
+        "DASH": "dash",
+        "NDASH": "ndash",
+        "NONE": "none",
+        "OTHER": "none",
+    }
+
+    # LIST types rendered as <ol>; every other non-bullet type becomes <ul>.
+    _ORDERED_LIST_TYPES = frozenset({"ARAB", "ALPHA", "alpha", "ROMAN", "roman"})
+
+    # Block-level children rendered inside a PARAG, and inside an ARTICLE or
+    # SUBDIV respectively.
+    _PARAG_BLOCKS = frozenset({"ALINEA", "COMMENT", "QUOT.S"})
+    _SECTION_BLOCKS = frozenset({"ALINEA", "PARAG", "COMMENT", "QUOT.S", "SUBDIV"})
+
+    def __init__(self, namespace: Optional[str] = None):
+        """
+        Initialize the converter.
+
+        Args:
+            namespace: Optional XML namespace to use when parsing elements
+        """
+        self.ns = namespace
+        self.ns_prefix = f"{{{namespace}}}" if namespace else ""
+
+    def _get_tag(self, tag: str) -> str:
+        """Get the tag name with namespace if available."""
+        return f"{self.ns_prefix}{tag}"
+
+    def _local_name(self, element: "ET._Element") -> Optional[str]:
+        """Return the element's tag without the configured namespace prefix.
+
+        Returns ``None`` for nodes whose ``tag`` is not a string (lxml comments
+        and processing instructions), so callers can skip them.
+        """
+        tag = element.tag
+        if not isinstance(tag, str):
+            return None
+        if self.ns_prefix and tag.startswith(self.ns_prefix):
+            return tag[len(self.ns_prefix):]
+        return tag
+
+    def _get_text(self, element: Optional["ET._Element"]) -> str:
+        """Get the plain (unescaped) text of an element, including nested text.
+
+        Elements that provide a ``text_content()`` method (for example
+        ``lxml.html`` elements) are asked directly; all other elements are
+        traversed manually. ``None``, comments and processing instructions
+        yield an empty string.
+        """
+        if element is None or not isinstance(element.tag, str):
+            return ""
+
+        text_content = getattr(element, "text_content", None)
+        if callable(text_content):
+            return text_content()
+
+        parts = [element.text or ""]
+        for child in element:
+            parts.append(self._get_text(child))
+            if child.tail:
+                parts.append(child.tail)
+        return "".join(parts)
+
+    def _create_id(self, identifier: str) -> str:
+        """Create a valid HTML ID from the article identifier."""
+        # Every character outside [a-zA-Z0-9-] becomes "-", so the result is
+        # always safe to place in an attribute without further escaping.
+        clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
+        return f"art-{clean_id}"
+
+    def _convert_btx(self, element: Optional["ET._Element"]) -> str:
+        """
+        Convert basic text elements (t_btx, t_btx.seq) to HTML.
+
+        This is a simplified implementation. In a complete version,
+        this would handle all the possible child elements defined in t_btx.
+
+        Args:
+            element: The element whose mixed content should be converted.
+
+        Returns:
+            The HTML for the element's content (without a wrapper for the
+            element itself); an empty string for ``None``, comments and
+            processing instructions.
+        """
+        if element is None or not isinstance(element.tag, str):
+            return ""
+
+        parts = [html.escape(element.text or "")]
+        for child in element:
+            parts.append(self._convert_inline(child))
+            # The tail belongs to this element's content even when the child
+            # itself (e.g. a comment) produced no output.
+            if child.tail:
+                parts.append(html.escape(child.tail))
+        return "".join(parts)
+
+    def _convert_inline(self, child: "ET._Element") -> str:
+        """Convert one child element found inside mixed content to HTML."""
+        child_tag = self._local_name(child)
+        if child_tag is None:
+            return ""
+
+        if child_tag == "HT":
+            # Highlighted text; unknown types are passed through unwrapped
+            opening, closing = self._HT_MARKUP.get(
+                child.get("TYPE", "NORMAL"), ("", "")
+            )
+            return f"{opening}{self._convert_btx(child)}{closing}"
+        if child_tag == "FT":
+            # Format text (numbers, codes, etc.)
+            ft_type = child.get("TYPE", "")
+            inner = self._convert_btx(child)
+            if ft_type in ("NUMBER", "DECIMAL"):
+                return f'<span class="ft-number">{inner}</span>'
+            if ft_type == "CODE":
+                return f"<code>{inner}</code>"
+            return f'<span class="ft-{html.escape(ft_type.lower())}">{inner}</span>'
+        if child_tag == "IE":
+            # Inclusion/exclusion marker
+            return '<span class="ie-marker">±</span>'
+        if child_tag == "BR":
+            return "<br>"
+        if child_tag == "P":
+            return f"<p>{self._convert_btx(child)}</p>"
+        if child_tag == "NOTE":
+            # Note reference
+            note_id = html.escape(child.get("NOTE.ID", ""))
+            return f'<sup class="note-ref" id="{note_id}">{self._convert_btx(child)}</sup>'
+        if child_tag == "QUOT.START":
+            return "&ldquo;"
+        if child_tag == "QUOT.END":
+            return "&rdquo;"
+        if child_tag == "LIST":
+            list_type = child.get("TYPE", "BULLET")
+            items = self._convert_list(child)
+            if list_type == "BULLET":
+                return f"<ul>{items}</ul>"
+            # Unknown list types fall back to an unstyled list instead of
+            # aborting the whole conversion.
+            list_style = self._LIST_STYLES.get(list_type, "none")
+            wrapper = "ol" if list_type in self._ORDERED_LIST_TYPES else "ul"
+            return f"<{wrapper} class='list-{list_style}'>{items}</{wrapper}>"
+        if child_tag == "TXT":
+            # Simple text element
+            return html.escape(self._get_text(child))
+        if child_tag == "LINK":
+            uri = html.escape(child.get("URI", "#"))
+            return f'<a href="{uri}">{self._convert_btx(child)}</a>'
+        if child_tag == "REF.DOC.OJ":
+            # Reference to an OJ document, rendered from its attributes
+            coll = html.escape(child.get("COLL", ""))
+            no_oj = html.escape(child.get("NO.OJ", ""))
+            date = html.escape(child.get("DATE.PUB", ""))
+            page = html.escape(child.get("PAGE.FIRST", ""))
+            return f'<span class="ref-oj">{coll} {no_oj}, {date}, p. {page}</span>'
+
+        # Recursively process other element types
+        return self._convert_btx(child)
+
+    def _convert_list(self, list_element: "ET._Element") -> str:
+        """Convert a Formex LIST element to HTML list items."""
+        items = []
+        for item in list_element.findall(self._get_tag("ITEM")):
+            pieces = []
+            for child in item:
+                child_tag = self._local_name(child)
+                if child_tag is None:
+                    continue
+                if child_tag == "NP":
+                    # Numbered paragraph - rendered only when both the number
+                    # and the text are present
+                    no_p = child.find(self._get_tag("NO.P"))
+                    txt = child.find(self._get_tag("TXT"))
+                    if no_p is not None and txt is not None:
+                        num = html.escape(self._get_text(no_p))
+                        text = html.escape(self._get_text(txt))
+                        pieces.append(f'<span class="item-number">{num}</span> {text}')
+                else:
+                    # P and any other element: emit the converted content
+                    pieces.append(self._convert_btx(child))
+            item_content = "".join(pieces)
+            items.append(f"<li>{item_content}</li>")
+
+        return "".join(items)
+
+    def _convert_alinea(self, alinea: "ET._Element") -> str:
+        """Convert an ALINEA element to HTML."""
+        return f'<p class="alinea">{self._convert_btx(alinea)}</p>'
+
+    def _convert_blocks(self, parent: "ET._Element", allowed: frozenset) -> str:
+        """Convert the block-level children of ``parent`` in document order.
+
+        Args:
+            parent: The ARTICLE, SUBDIV or PARAG element being converted.
+            allowed: Local tag names to render; other children are ignored.
+        """
+        parts = []
+        for child in parent:
+            child_tag = self._local_name(child)
+            if child_tag not in allowed:
+                continue
+            if child_tag == "ALINEA":
+                parts.append(self._convert_alinea(child))
+            elif child_tag == "PARAG":
+                parts.append(self._convert_parag(child))
+            elif child_tag == "SUBDIV":
+                parts.append(self._convert_subdiv(child))
+            elif child_tag == "COMMENT":
+                parts.append(f'<div class="comment">{self._convert_btx(child)}</div>')
+            elif child_tag == "QUOT.S":
+                parts.append(
+                    f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
+                )
+        return "".join(parts)
+
+    def _convert_parag(self, parag: "ET._Element") -> str:
+        """Convert a PARAG (paragraph) element to HTML."""
+        identifier = parag.get("IDENTIFIER", "")
+        parag_id = self._create_id(identifier) if identifier else ""
+
+        # Paragraph number (empty when there is no NO.PARAG child)
+        no_parag = parag.find(self._get_tag("NO.PARAG"))
+        parag_num = html.escape(self._get_text(no_parag))
+
+        content = self._convert_blocks(parag, self._PARAG_BLOCKS)
+
+        return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'
+
+    def _convert_subdiv(self, subdiv: "ET._Element") -> str:
+        """Convert a SUBDIV (subdivision) element to HTML."""
+        title = ""
+        title_elem = subdiv.find(self._get_tag("TITLE"))
+        if title_elem is not None:
+            # TI is the title, STI elements are subtitles
+            ti_text = self._convert_btx(title_elem.find(self._get_tag("TI")))
+            sti_list = [
+                self._convert_btx(sti)
+                for sti in title_elem.findall(self._get_tag("STI"))
+            ]
+
+            title = f'<h4 class="subdivision-title">{ti_text}</h4>'
+            if sti_list:
+                subtitle = " ".join(sti_list)
+                title += f'<h5 class="subdivision-subtitle">{subtitle}</h5>'
+
+        content = self._convert_blocks(subdiv, self._SECTION_BLOCKS)
+
+        return f'<section class="subdivision">{title}{content}</section>'
+
+    def convert_article(self, article: Union[str, "ET._Element"]) -> str:
+        """
+        Convert a Formex <ARTICLE> element to HTML5.
+
+        Processing instructions are stripped from ``article`` in place, so an
+        element passed in by the caller is modified.
+
+        Args:
+            article: Either an lxml Element or an XML string representing an ARTICLE
+
+        Returns:
+            A string containing the HTML5 representation of the article, or
+            a ``<p>`` element describing the error if an XML string could not
+            be parsed
+        """
+        # Parse the article if it's a string
+        if isinstance(article, str):
+            try:
+                parser = ET.XMLParser(remove_blank_text=True)
+                article = ET.fromstring(article.encode("utf-8"), parser)
+            except ET.XMLSyntaxError as e:
+                return f"<p>Error parsing XML: {html.escape(str(e))}</p>"
+
+        # Extract the article identifier
+        identifier = article.get("IDENTIFIER", "")
+        article_id = self._create_id(identifier)
+
+        # Strip processing instructions
+        ET.strip_tags(article, lxml.etree.PI)
+
+        # First TI.ART / STI.ART anywhere below the article, if any
+        article_title = self._convert_btx(article.find(f".//{self._get_tag('TI.ART')}"))
+        article_subtitle = self._convert_btx(
+            article.find(f".//{self._get_tag('STI.ART')}")
+        )
+
+        # Build the header section
+        header = f'<header><h3 class="article-title">{article_title}</h3>'
+        if article_subtitle:
+            header += f'<h4 class="article-subtitle">{article_subtitle}</h4>'
+        header += "</header>"
+
+        # Alineas, paragraphs, comments, quotations and subdivisions that are
+        # direct children of the article, in document order
+        content = self._convert_blocks(article, self._SECTION_BLOCKS)
+
+        # Assemble the complete article
+        return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>'
--- a/src/formex_viewer/main.py
+++ b/src/formex_viewer/main.py
@@ -0,0 +1,232 @@
+import io
+import logging
+import typing
+import zipfile
+from dataclasses import dataclass
+from enum import StrEnum
+from pathlib import Path
+
+import httpx
+from lxml import etree
+
+from formex_viewer import formex4
+
+
+class SystemName(StrEnum):
+    """Identifier schemes used as the first path segment of a Cellar resource URL."""
+
+    CELEX = "celex"
+    CELLAR = "cellar"
+    OJ = "oj"
+
+
+@dataclass
+class CellarIdentifier:
+    """A resource identifier made of an identifier scheme and an id within it."""
+
+    # Scheme the id belongs to; becomes part of the request path.
+    system_name: SystemName
+    # The id itself, e.g. "32024R2847" for a CELEX number.
+    system_id: str
+
+
+class Language(StrEnum):
+    """Language enum for Cellar"""
+
+    # The values are sent verbatim as the Accept-Language request header.
+    ENG = "eng"
+    DEU = "deu"
+    FRA = "fra"
+    ITA = "ita"
+
+
+class ContentType(StrEnum):
+    """Media types sent as the Accept header when requesting a publication."""
+
+    XML_FMX4 = "application/xml;mtype=fmx4"
+    ZIP_FMX4 = "application/zip;mtype=fmx4"
+    XHTML_XML = "application/xhtml+xml"
+
+
+@dataclass
+class Metadata:
+    """Metadata extracted from an XML notice."""
+
+    # Identifier read from the first NOTICE/EXPRESSION/URI element.
+    publication_text: CellarIdentifier
+
+    @classmethod
+    def from_xml(cls, xmlstr: str):
+        """Build a ``Metadata`` instance from a notice given as an XML string.
+
+        The first ``//NOTICE/EXPRESSION/URI`` element supplies the identifier:
+        its ``TYPE`` child is the system name and its ``IDENTIFIER`` child the
+        system id.
+        """
+        root = etree.fromstring(xmlstr.encode("utf-8"))
+        uri_matches = root.xpath("//NOTICE/EXPRESSION/URI")
+        uri = uri_matches[0]
+
+        system_name = SystemName(uri.find("TYPE").text)
+        system_id = uri.find("IDENTIFIER").text
+        identifier = CellarIdentifier(system_name=system_name, system_id=system_id)
+
+        return cls(publication_text=identifier)
+
+
+def extract_fmx_main_publication(zip_data: typing.BinaryIO) -> str:
+    """Return the main document of a Formex 4 ZIP archive as text.
+
+    Steps:
+    1. Locate the document descriptor (``*.doc.fmx.xml`` or ``*.doc.xml``).
+    2. Read the main publication reference from it
+       (XPath ``/DOC/FMX/DOC.MAIN.PUB/REF.PHYS`` with ``TYPE="DOC.XML"``).
+    3. Read the archive member named by that reference's ``FILE`` attribute.
+
+    Args:
+        zip_data: Binary file object containing the ZIP archive.
+
+    Returns:
+        The UTF-8 decoded content of the main publication file.
+
+    Raises:
+        ValueError: If the descriptor, the reference or its ``FILE``
+            attribute cannot be found.
+    """
+    descriptor_suffixes = (".doc.fmx.xml", ".doc.xml")
+
+    with zipfile.ZipFile(zip_data) as archive:
+        # The first matching member is taken as the descriptor
+        descriptor_name = next(
+            (name for name in archive.namelist() if name.endswith(descriptor_suffixes)),
+            None,
+        )
+        if descriptor_name is None:
+            logging.info("ZIP file contents: %s", archive.namelist())
+            raise ValueError(
+                "No document descriptor (*.doc.xml / *.doc.fmx.xml) found in the archive"
+            )
+
+        with archive.open(descriptor_name) as handle:
+            descriptor = etree.parse(handle)
+
+        references = descriptor.xpath(
+            '/DOC/FMX/DOC.MAIN.PUB/REF.PHYS[@TYPE="DOC.XML"]'
+        )
+        if not references:
+            raise ValueError(
+                "Main publication reference not found in document descriptor"
+            )
+
+        main_name = references[0].get("FILE")
+        if not main_name:
+            raise ValueError("FILE attribute not found in main publication reference")
+
+        with archive.open(main_name) as handle:
+            payload = handle.read()
+        return payload.decode("utf-8")
+
+
+class CellarClient:
+    """HTTP client for fetching notices and publication texts.
+
+    The client owns an ``httpx.Client``; call ``close()`` or use the instance
+    as a context manager to release its connections when done.
+    """
+
+    def __init__(self, language: Language = Language.ENG):
+        """Create a client that requests content in ``language``."""
+        self._client = httpx.Client(
+            base_url="http://publications.europa.eu", follow_redirects=True
+        )
+        self.language = language
+
+    def close(self) -> None:
+        """Close the underlying HTTP client."""
+        self._client.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.close()
+
+    def metadata(self, cellar_id: CellarIdentifier):
+        """Fetch metadata from Cellar
+
+        Requests the object notice for ``cellar_id`` and parses it with
+        ``Metadata.from_xml``. Raises ``httpx.HTTPStatusError`` on a 4xx/5xx
+        response.
+        """
+
+        resp = self._client.get(
+            f"/resource/{cellar_id.system_name}/{cellar_id.system_id}",
+            headers={
+                "Accept": "application/xml;notice=object",
+                "Accept-Language": self.language,
+            },
+        )
+        resp.raise_for_status()
+        return Metadata.from_xml(resp.text)
+
+    def publication_text(
+        self,
+        cellar_id: CellarIdentifier,
+        content_type: ContentType,
+    ) -> str:
+        """Fetch a publication from Cellar
+
+        The identifier of the publication text is first resolved through
+        ``metadata``; the text itself is then requested with ``content_type``
+        as the Accept header.
+
+        Returns:
+            The main Formex document extracted from the archive when the
+            response is a ZIP, otherwise the response body as text.
+
+        Raises:
+            httpx.HTTPStatusError: On a 4xx/5xx response.
+        """
+
+        metadata = self.metadata(cellar_id)
+        identifier = metadata.publication_text
+
+        resp = self._client.get(
+            f"/resource/{identifier.system_name}/{identifier.system_id}",
+            headers={
+                "Accept": content_type,
+                "Accept-Language": self.language,
+            },
+        )
+        resp.raise_for_status()
+
+        # The server decides the representation; only unpack real archives
+        if "zip" in resp.headers.get("Content-Type", ""):
+            return extract_fmx_main_publication(io.BytesIO(resp.content))
+        return resp.text
+
+
+def _get_path(
+    base_path: Path,
+    typ: typing.Literal["recital", "article", "annex"],
+    num: str,
+    extension: str,
+) -> Path:
+    """Get the path for a given type and number
+
+    Args:
+        base_path: Output root directory.
+        typ: Kind of item; selects the sub-directory (plural form).
+        num: Item number, used verbatim as the file stem.
+        extension: File extension including the leading dot.
+
+    Returns:
+        ``base_path / <plural of typ> / <num><extension>``
+
+    Raises:
+        KeyError: If ``typ`` is not one of the known kinds.
+    """
+
+    plurals = {
+        "recital": "recitals",
+        "article": "articles",
+        "annex": "annexes",
+    }
+
+    # Append the extension instead of using with_suffix(): the latter would
+    # replace everything after a dot in ``num`` (e.g. "5.1" -> "5.html").
+    return base_path / plurals[typ] / f"{num}{extension}"
+
+
+class HtmlConverter:
+    """Convert a Formex 4 XML document to HTML files"""
+
+    extension = ".html"
+
+    def __init__(self, fmx4_content: str, outdir: Path):
+        """Parse ``fmx4_content`` and make sure ``outdir`` exists."""
+        self._fmx4 = fmx4_content
+        self._xml = etree.fromstring(self._fmx4.encode("utf-8"))
+        self._outdir = outdir
+        self._outdir.mkdir(parents=True, exist_ok=True)
+
+    def convert(self) -> None:
+        """Split the publication text into separate files.
+
+        Structure:
+        - recitals/: All recitals, one per file
+        - articles/: All articles, one per file
+
+        Annexes are not extracted yet. Recitals without a number or text and
+        articles without an identifier are skipped with a warning.
+        """
+
+        # Extract recitals
+        for recital in self._xml.xpath("//GR.CONSID/CONSID/NP"):
+            no_p = recital.find("NO.P")
+            num_str = (no_p.text or "") if no_p is not None else ""
+            num = num_str.strip().strip("()")
+            if not num:
+                logging.warning("Skipping recital without a number")
+                continue
+
+            txt_elem = recital.find("TXT")
+            if txt_elem is None:
+                logging.warning("Recital %s has no text", num)
+                continue
+
+            filename = _get_path(self._outdir, "recital", num, self.extension)
+            filename.parent.mkdir(parents=True, exist_ok=True)
+            filename.write_text(formex4.text_content(txt_elem), encoding="utf-8")
+
+        # Extract articles
+        article_converter = formex4.FormexArticleConverter()
+        for article in self._xml.xpath("//ARTICLE"):
+            identifier = article.get("IDENTIFIER")
+            if not identifier:
+                logging.warning("Skipping article without an IDENTIFIER attribute")
+                continue
+            # Drop zero padding but keep a usable name for an all-zero id
+            num = identifier.lstrip("0") or "0"
+
+            filename = _get_path(self._outdir, "article", num, self.extension)
+            filename.parent.mkdir(parents=True, exist_ok=True)
+
+            txt = article_converter.convert_article(article)
+            filename.write_text(txt, encoding="utf-8")
+
+
+if __name__ == "__main__":
+    # Demo run: download one regulation and split it into HTML files in ./tmp
+    logging.basicConfig(level=logging.INFO)
+
+    # Cyber Resilience Act - CELEX 32024R2847
+    cra_id = CellarIdentifier(system_name=SystemName.CELEX, system_id="32024R2847")
+
+    # AI Act - CELEX 32024R1689 (alternative input, not used below)
+    aia_id = CellarIdentifier(system_name=SystemName.CELEX, system_id="32024R1689")
+
+    content_type = ContentType.ZIP_FMX4
+    language = Language.ENG
+
+    client = CellarClient(language=language)
+
+    tmpdir = Path("tmp")
+    tmpdir.mkdir(parents=True, exist_ok=True)
+    # The language is configured on the client; publication_text() only
+    # accepts the identifier and the content type.
+    fmx4_text = client.publication_text(cra_id, content_type)
+    converter = HtmlConverter(fmx4_text, tmpdir)
+    converter.convert()
--- a/src/formex_viewer/parser.py
+++ b/src/formex_viewer/parser.py
--- a/src/formex_viewer/server.py
+++ b/src/formex_viewer/server.py
@@ -0,0 +1,82 @@
+import lxml.etree as ET
+from fastapi import FastAPI, Response
+from fastapi.middleware.cors import CORSMiddleware
+
+from formex_viewer.formex4 import FormexArticleConverter
+from formex_viewer.main import (
+    CellarClient,
+    CellarIdentifier,
+    ContentType,
+    Language,
+    SystemName,
+)
+
+# Origins allowed to call this API from a browser (CORS).
+origins = [
+    "http://localhost:5173",
+]
+app = FastAPI()
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Cache key: (CELEX id, language).
+type CacheKey = tuple[str, Language]
+
+# In-process cache of downloaded FMX4 documents; entries are never evicted.
+CACHE: dict[CacheKey, str] = {}
+
+
+def _get_fmx4_data(celex_id: str, language: Language) -> str:
+    """
+    Return the FMX4 document for a CELEX id and language.
+
+    The document is served from ``CACHE`` when present; otherwise it is
+    downloaded through a new ``CellarClient`` and stored in the cache.
+    """
+    key = (celex_id, language)
+    try:
+        return CACHE[key]
+    except KeyError:
+        pass
+
+    identifier = CellarIdentifier(system_name=SystemName.CELEX, system_id=celex_id)
+    document = CellarClient(language).publication_text(
+        identifier, ContentType.ZIP_FMX4
+    )
+    CACHE[key] = document
+    return document
+
+
+@app.get("/{celex_id}/articles")
+def article_ids(celex_id: str, language: Language = Language.ENG):
+    """
+    Return the sorted numeric article identifiers of a document.
+
+    Identifiers are read from the ``IDENTIFIER`` attribute of every
+    ``ARTICLE`` element. Identifiers that are not plain integers are skipped,
+    since the article endpoint only accepts integer ids.
+    """
+    fmx4_data = _get_fmx4_data(celex_id, language)
+    xml = ET.fromstring(fmx4_data.encode("utf-8"))
+
+    numbers = []
+    for identifier in xml.xpath("//ARTICLE/@IDENTIFIER"):
+        try:
+            # int() ignores zero padding, including an all-zero identifier
+            numbers.append(int(identifier))
+        except ValueError:
+            continue
+    numbers.sort()
+    return numbers
+
+
+@app.get("/{celex_id}/articles/{article_id}/{language}")
+def article(celex_id: str, article_id: int, language: Language = Language.ENG):
+    """
+    Return one article of a document rendered as HTML.
+
+    The article is selected by comparing ``article_id`` with the
+    ``IDENTIFIER`` attribute of each ``ARTICLE`` element, ignoring zero
+    padding. Responds with 404 when no article matches.
+    """
+    fmx4_data = _get_fmx4_data(celex_id, language)
+    xml = ET.fromstring(fmx4_data.encode("utf-8"))
+
+    wanted = str(article_id)
+    for candidate in xml.xpath("//ARTICLE"):
+        identifier = candidate.get("IDENTIFIER")
+        if not identifier:
+            continue
+        # An all-zero identifier strips to "", which stands for article 0
+        if (identifier.lstrip("0") or "0") == wanted:
+            return Response(
+                FormexArticleConverter().convert_article(candidate),
+                media_type="text/html",
+            )
+
+    return Response(
+        f"Article {article_id} not found",
+        status_code=404,
+        media_type="text/plain",
+    )