Paragraph-level links, preview tooltips

2025-04-30 12:04:38 +02:00
parent ea7885eeee
commit 7dd913df7b
25 changed files with 569 additions and 102 deletions
--- a/src/formex_viewer/formex4.py
+++ b/src/formex_viewer/formex4.py
@@ -2,7 +2,7 @@ import html
 import re
 import warnings
 from dataclasses import dataclass
-from typing import Optional, Union
+from typing import Literal, Optional, Union

 import lxml.etree
 from lxml import etree as ET
@@ -29,9 +29,10 @@ def text_content(el: lxml.etree.Element) -> str:

@dataclass
 class CrossReference:
+    """A cross-reference to an article or annex, as built by extract_xrefs."""
-    id: str
+    # Kind of element the reference points at.
+    target: Literal["article", "annex"]
+    # The full text of the regex match that produced this reference.
    text: str
-    target: str
+    # Captured number: the article digits or the annex Roman-numeral letters.
+    id: str
+    # Paragraph number from an "Article N(M)"-style match, if one was captured.
+    # NOTE(review): extract_xrefs passes the raw regex capture (a str) here,
+    # while the annotation says int — confirm and convert at the call site.
+    paragraph: int | None = None


 def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
@@ -69,8 +70,8 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
    # Also, match only at word boundaries to prevent partial matches
    parts = PATTERN_PARTS[language]
    patterns = {
-        "article": rf"\b{parts["article"]}\s+(?P<art_num>\d+){parts["exclusion"]}\b",
-        "annex": rf"\b{parts["annex"]}\s+(?P<annex_num>[DILMVX]+){parts["exclusion"]}\b",
+        "article": rf"\b{parts["article"]}\s+(?P<art_num>\d+)(?:[(](?P<parag_num>\d+)[)])?(?:{parts["exclusion"]})",
+        "annex": rf"\b{parts["annex"]}\s+(?P<annex_num>[DILMVX]+)(?:{parts["exclusion"]})",
    }
    for key, pattern in patterns.items():
        matches = re.finditer(pattern, text, flags=re.IGNORECASE)
@@ -78,13 +79,54 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
            crossref_id = (
                match.group("art_num") if key == "article" else match.group("annex_num")
            )
+            parag_num = match.groupdict().get("parag_num")
            crossref_text = match.group(0)
            crossrefs.append(
-                CrossReference(id=crossref_id, text=crossref_text, target=key)
+                CrossReference(
+                    target=key,
+                    id=crossref_id,
+                    paragraph=parag_num,
+                    text=crossref_text,
+                )
            )
    return crossrefs


+def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None:
+    """Extract a specific article from a Formex document.
+
+    Args:
+        doc: The XML element to search; the query covers its descendants.
+        article_id: The article number. It is zero-padded to three digits
+            to build the ``IDENTIFIER`` attribute value that is matched.
+
+    Returns:
+        The first matching ``<ARTICLE>`` element, or ``None`` if no article
+        carries that identifier.
+    """
+
+    # Evaluate the XPath query once and reuse the result, rather than
+    # running it a second time just to index into it.
+    matches = doc.xpath(f".//ARTICLE[@IDENTIFIER='{article_id:03d}']")
+    return matches[0] if matches else None
+
+
+def extract_paragraph(
+    doc: ET.ElementBase, article_id: int, paragraph_id: int
+) -> ET.ElementBase | None:
+    """Extract a specific paragraph from an article in a Formex document.
+
+    Args:
+        doc: The XML element to search; the query covers its descendants.
+        article_id: The article number.
+        paragraph_id: The paragraph number within that article.
+
+    Returns:
+        The first ``<PARAG>`` element whose ``IDENTIFIER`` attribute equals
+        ``"AAA.PPP"`` (both numbers zero-padded to three digits), or ``None``
+        if there is no such paragraph.
+    """
+
+    # Evaluate the XPath query once and reuse the result, rather than
+    # running it a second time just to index into it.
+    identifier = f"{article_id:03d}.{paragraph_id:03d}"
+    matches = doc.xpath(f".//PARAG[@IDENTIFIER='{identifier}']")
+    return matches[0] if matches else None
+
+
 class FormexArticleConverter:
    """Converts Formex XML <ARTICLE> elements to semantic HTML5."""

@@ -136,7 +178,7 @@ class FormexArticleConverter:
        # Replace the cross-reference text with a link
        text = text.replace(
            xref.text,
-            f'<a href="" data-target="{xref.target}" data-id="{xref.id}" class="cross-ref">{xref.text}</a>',
+            f'<a href="" data-target="{xref.target}" data-id="{xref.id}" data-paragraph-id="{xref.paragraph or ''}" class="cross-ref">{xref.text}</a>',
        )
        return text

@@ -418,10 +460,13 @@ class FormexArticleConverter:
        article_subtitle = self._convert_btx(sti_art) if sti_art is not None else ""

        # Build the header section
-        header = f'<header><h3 class="article-title">{article_title}</h3>'
-        if article_subtitle:
-            header += f'<h4 class="article-subtitle">{article_subtitle}</h4>'
-        header += "</header>"
+        if article_title and article_subtitle:
+            header = f'<header><h3 class="article-title">{article_title}</h3>'
+            if article_subtitle:
+                header += f'<h4 class="article-subtitle">{article_subtitle}</h4>'
+            header += "</header>"
+        else:
+            header = ""

        # Process the content based on what's present
        content = ""