Article cross-reference linking

2025-04-29 09:34:14 +02:00
parent 9597ccc3bd
commit 04f46e3893
3 changed files with 117 additions and 3 deletions
--- a/src/formex_viewer/formex4.py
+++ b/src/formex_viewer/formex4.py
@@ -1,10 +1,14 @@
 import html
 import re
+import warnings
+from dataclasses import dataclass
 from typing import Optional, Union

 import lxml.etree
 from lxml import etree as ET

+from formex_viewer.main import Language
+

 def text_content(el: lxml.etree.Element) -> str:
    """Get the text content of an XML element, including all child elements."""
@@ -23,17 +27,79 @@ def text_content(el: lxml.etree.Element) -> str:
    return "".join(_iterate(el))


+@dataclass
+class CrossReference:
+    id: str
+    text: str
+    target: str
+
+
+def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
+    """Extract cross-references from an XML element.
+
+    Args:
+        el: The XML element to extract cross-references from.
+
+    Returns:
+        A list of CrossReference objects, one per article or annex reference matched in the element's text.
+    """
+    crossrefs = []
+    text = text_content(el)
+
+    PATTERN_PARTS = {
+        Language.ENG: {
+            "article": r"(Art\.|Articles?)",
+            "annex": r"(Ann\.|Annex)",
+            "exclusion": r"(?! of(?! this))",
+        },
+        Language.DEU: {
+            "article": r"(Art\.|Artikels?)",
+            "annex": r"(Anhang)",
+            "exclusion": r"(?! von)",
+        },
+    }
+
+    if language not in PATTERN_PARTS:
+        warnings.warn(
+            f"Language '{language}' not supported for cross-reference extraction. Returning empty list."
+        )
+        return []
+
+    # Avoid over-matching references to other texts (e.g. "Article 5 of Regulation ...") by using a negative lookahead.
+    # Also, match only at word boundaries to prevent partial matches.
+    parts = PATTERN_PARTS[language]
+    patterns = {
+        "article": rf"\b{parts["article"]}\s+(?P<art_num>\d+){parts["exclusion"]}\b",
+        "annex": rf"\b{parts["annex"]}\s+(?P<annex_num>[DILMVX]+){parts["exclusion"]}\b",
+    }
+    for key, pattern in patterns.items():
+        matches = re.finditer(pattern, text, flags=re.IGNORECASE)
+        print(f"Pattern: {pattern}")
+        for match in matches:
+            print(f"Match: {match.group(0)}")
+            crossref_id = (
+                match.group("art_num") if key == "article" else match.group("annex_num")
+            )
+            crossref_text = match.group(0)
+            crossrefs.append(
+                CrossReference(id=crossref_id, text=crossref_text, target=key)
+            )
+    return crossrefs
+
+
 class FormexArticleConverter:
    """Converts Formex XML <ARTICLE> elements to semantic HTML5."""

-    def __init__(self, namespace: Optional[str] = None):
+    def __init__(self, language: Language, namespace: Optional[str] = None):
        """
        Initialize the converter.

        Args:
+            language: Language of the document; passed to cross-reference extraction to select the matching patterns
            namespace: Optional XML namespace to use when parsing elements
        """
        self.ns = namespace
+        self.language = language
        self.ns_prefix = f"{{{namespace}}}" if namespace else ""

    def _get_tag(self, tag: str) -> str:
@@ -67,6 +133,15 @@ class FormexArticleConverter:
        clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
        return f"art-{clean_id}"

+    def _replace_xref(self, text: str, xref: CrossReference) -> str:
+        """Replace a cross-reference instance with semantic markup in the text."""
+        # Replace the cross-reference text with a link
+        text = text.replace(
+            xref.text,
+            f'<a href="" data-target="{xref.target}" data-id="{xref.id}" class="cross-ref">{xref.text}</a>',
+        )
+        return text
+
    def _convert_btx(self, element: ET.Element) -> str:
        """
        Convert basic text elements (t_btx, t_btx.seq) to HTML.
@@ -79,6 +154,16 @@ class FormexArticleConverter:

        result = element.text or ""

+        is_title = element.tag in ("TI", "STI", "TI.ART", "STI.ART")
+        if not is_title and not element.getchildren():
+            # Cross-references are only linked in leaf elements (no children), and never in titles
+            xrefs = extract_xrefs(element, self.language)
+            print("Extracted cross-references: ", xrefs)
+
+            for xref in xrefs:
+                # Replace the cross-reference text with a link
+                result = self._replace_xref(result, xref)
+
        for child in element:
            child_tag = child.tag.replace(self.ns_prefix, "")

@@ -176,7 +261,13 @@ class FormexArticleConverter:
                result += self._convert_btx(child)

            if child.tail:
-                result += child.tail
+                xrefs = extract_xrefs(child, self.language)
+                tail_text = child.tail
+                for xref in xrefs:
+                    # Replace the cross-reference text with a link
+                    tail_text = self._replace_xref(tail_text, xref)
+
+                result += tail_text

        return result

@@ -200,6 +291,12 @@ class FormexArticleConverter:
                    if no_p is not None and txt is not None:
                        num = self._get_text(no_p)
                        text = self._get_text(txt)
+
+                        # Handle cross-references within the text
+                        xrefs = extract_xrefs(txt, self.language)
+                        for xref in xrefs:
+                            text = self._replace_xref(text, xref)
+
                        item_content += f'<span class="item-number">{num}</span> {text}'
                elif child_tag == "P":
                    # Regular paragraph
--- a/src/formex_viewer/server.py
+++ b/src/formex_viewer/server.py
@@ -133,7 +133,7 @@ def article(celex_id: str, article_id: int, language: Language = Language.ENG):
        num = article.get("IDENTIFIER").lstrip("0")
        if num == str(article_id):
            return Response(
-                FormexArticleConverter().convert_article(article),
+                FormexArticleConverter(language=language).convert_article(article),
                media_type="text/html",
            )