Paragraph-level links, preview tooltips

2025-04-30 12:04:38 +02:00
parent ea7885eeee
commit 7dd913df7b
25 changed files with 569 additions and 102 deletions
@@ -2,7 +2,7 @@ import html
 import re
 import warnings
 from dataclasses import dataclass
-from typing import Optional, Union
+from typing import Literal, Optional, Union

 import lxml.etree
 from lxml import etree as ET
@@ -29,9 +29,10 @@ def text_content(el: lxml.etree.Element) -> str:

@dataclass
 class CrossReference:
-    id: str
+    target: Literal["article", "annex"]
    text: str
-    target: str
+    id: str
+    paragraph: int | None = None


 def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
@@ -69,8 +70,8 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
    # Also, match only at word boundaries to prevent partial matches
    parts = PATTERN_PARTS[language]
    patterns = {
-        "article": rf"\b{parts["article"]}\s+(?P<art_num>\d+){parts["exclusion"]}\b",
-        "annex": rf"\b{parts["annex"]}\s+(?P<annex_num>[DILMVX]+){parts["exclusion"]}\b",
+        "article": rf"\b{parts["article"]}\s+(?P<art_num>\d+)(?:[(](?P<parag_num>\d+)[)])?(?:{parts["exclusion"]})",
+        "annex": rf"\b{parts["annex"]}\s+(?P<annex_num>[DILMVX]+)(?:{parts["exclusion"]})",
    }
    for key, pattern in patterns.items():
        matches = re.finditer(pattern, text, flags=re.IGNORECASE)
@@ -78,13 +79,54 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
            crossref_id = (
                match.group("art_num") if key == "article" else match.group("annex_num")
            )
+            parag_num = match.groupdict().get("parag_num")
            crossref_text = match.group(0)
            crossrefs.append(
-                CrossReference(id=crossref_id, text=crossref_text, target=key)
+                CrossReference(
+                    target=key,
+                    id=crossref_id,
+                    paragraph=parag_num,
+                    text=crossref_text,
+                )
            )
    return crossrefs


+def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None:
+    """Extract a specific article from a Formex document.
+
+    Args:
+        doc: The XML document (or element) to search; the lookup is relative
+            to it and covers all of its descendants.
+        article_id: The article number. It is zero-padded to three digits to
+            build the ``IDENTIFIER`` attribute value (e.g. ``5`` -> ``"005"``).
+
+    Returns:
+        The first matching ``<ARTICLE>`` element, or ``None`` if there is none.
+    """
+
+    # Evaluate the XPath once and reuse the result instead of querying twice.
+    matches = doc.xpath(f".//ARTICLE[@IDENTIFIER='{article_id:03d}']")
+    return matches[0] if matches else None
+
+
+def extract_paragraph(
+    doc: ET.ElementBase, article_id: int, paragraph_id: int
+) -> ET.ElementBase | None:
+    """Extract a specific paragraph from an article in a Formex document.
+
+    Args:
+        doc: The XML document (or element) to search; the lookup is relative
+            to it and covers all of its descendants.
+        article_id: The article number.
+        paragraph_id: The paragraph number.
+
+    Returns:
+        The first ``<PARAG>`` element whose ``IDENTIFIER`` attribute equals
+        ``"AAA.PPP"`` (article and paragraph numbers, each zero-padded to
+        three digits), or ``None`` if there is none.
+    """
+
+    # Evaluate the XPath once and reuse the result instead of querying twice.
+    identifier = f"{article_id:03d}.{paragraph_id:03d}"
+    matches = doc.xpath(f".//PARAG[@IDENTIFIER='{identifier}']")
+    return matches[0] if matches else None
+
+
 class FormexArticleConverter:
    """Converts Formex XML <ARTICLE> elements to semantic HTML5."""

@@ -136,7 +178,7 @@ class FormexArticleConverter:
        # Replace the cross-reference text with a link
        text = text.replace(
            xref.text,
-            f'<a href="" data-target="{xref.target}" data-id="{xref.id}" class="cross-ref">{xref.text}</a>',
+            f'<a href="" data-target="{xref.target}" data-id="{xref.id}" data-paragraph-id="{xref.paragraph or ''}" class="cross-ref">{xref.text}</a>',
        )
        return text

@@ -418,10 +460,13 @@ class FormexArticleConverter:
        article_subtitle = self._convert_btx(sti_art) if sti_art is not None else ""

        # Build the header section
-        header = f'<header><h3 class="article-title">{article_title}</h3>'
-        if article_subtitle:
-            header += f'<h4 class="article-subtitle">{article_subtitle}</h4>'
-        header += "</header>"
+        if article_title and article_subtitle:
+            header = f'<header><h3 class="article-title">{article_title}</h3>'
+            if article_subtitle:
+                header += f'<h4 class="article-subtitle">{article_subtitle}</h4>'
+            header += "</header>"
+        else:
+            header = ""

        # Process the content based on what's present
        content = ""
@@ -2,7 +2,11 @@ import lxml.etree as ET
 from fastapi import APIRouter, FastAPI, Response
 from fastapi.middleware.cors import CORSMiddleware

-from formex_viewer.formex4 import FormexArticleConverter
+from formex_viewer.formex4 import (
+    FormexArticleConverter,
+    extract_article,
+    extract_paragraph,
+)
 from formex_viewer.main import (
    CellarClient,
    CellarIdentifier,
@@ -121,21 +125,46 @@ def toc(celex_id: str, language: Language = Language.ENG):


@api_router.get("/{celex_id}/articles/{article_id}/{language}")
-def article(celex_id: str, article_id: int, language: Language = Language.ENG):
+def article(
+    celex_id: str,
+    article_id: int,
+    language: Language = Language.ENG,
+):
    """
    Fetch an article from the server.
    """
    xml = _get_fmx4_data(celex_id, language)
+    article = extract_article(xml, article_id=article_id)

-    article_xpath = "//ARTICLE"
-    articles = xml.xpath(article_xpath)
-    for article in articles:
-        num = article.get("IDENTIFIER").lstrip("0")
-        if num == str(article_id):
-            return Response(
-                FormexArticleConverter(language=language).convert_article(article),
-                media_type="text/html",
-            )
+    if article is None:
+        return Response(
+            "Article not found",
+            status_code=404,
+        )
+
+    return Response(
+        FormexArticleConverter(language=language).convert_article(article),
+        media_type="text/html",
+    )
+
+
+@api_router.get("/{celex_id}/articles/{article_id}/{parag_id}/{language}")
+def paragraph(
+    celex_id: str,
+    article_id: int,
+    parag_id: int,
+    language: Language = Language.ENG,
+):
+    """
+    Fetch a paragraph within an article from the server.
+
+    Responds with the paragraph rendered as HTML, or with a 404 if the
+    document has no paragraph with the requested article/paragraph numbers.
+    """
+    xml = _get_fmx4_data(celex_id, language)
+    parag = extract_paragraph(xml, article_id=article_id, paragraph_id=parag_id)
+
+    # extract_paragraph returns None when nothing matches; answer with a 404
+    # (as the article endpoint does) rather than passing None to the converter.
+    if parag is None:
+        return Response(
+            "Paragraph not found",
+            status_code=404,
+        )
+
+    return Response(
+        FormexArticleConverter(language=language).convert_article(parag),
+        media_type="text/html",
+    )


 app.include_router(api_router, prefix="/api")