Article cross-reference linking

2025-04-29 09:34:14 +02:00
parent 9597ccc3bd
commit 04f46e3893
3 changed files with 117 additions and 3 deletions
@@ -25,6 +25,23 @@ function Panel({ language }: PanelProps) {
    const articleElement = articleRef.current;
    if (!articleElement) return;
    // Replace cross-reference links with page navigation
    const crossRefs = articleElement.querySelectorAll(
      "a.cross-ref"
    ) as NodeListOf<HTMLAnchorElement>;
    crossRefs.forEach((link) => {
      const target = link.getAttribute("data-target");
      const targetId = link.getAttribute("data-id");
      if (target && targetId) {
        if (target === "article") {
          link.setAttribute("href", `../articles/${targetId}`);
        }
      } else {
        console.warn("No target or ID found for link:", link);
      }
    });
    const paragraphs = articleElement.querySelectorAll(".paragraph");
    // Highlight the selected paragraph
@@ -1,10 +1,14 @@
 import html
 import re
 import warnings
 from dataclasses import dataclass
 from typing import Optional, Union
 import lxml.etree
 from lxml import etree as ET
 from formex_viewer.main import Language
 def text_content(el: lxml.etree.Element) -> str:
    """Get the text content of an XML element, including all child elements."""
@@ -23,17 +27,79 @@ def text_content(el: lxml.etree.Element) -> str:
    return "".join(_iterate(el))
@dataclass
class CrossReference:
    """A textual reference to another part of the document found in running text."""

    # Identifier captured from the reference (e.g. the number following the keyword).
    id: str
    # The complete matched text, exactly as it appears in the source text.
    text: str
    # Kind of element the reference points to (e.g. "article" or "annex").
    target: str
 def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
    """Extract cross-references from an XML element.
    Args:
        el: The XML element to extract cross-references from.
    Returns:
        A dictionary with cross-reference IDs as keys and their text content as values.
    """
    crossrefs = []
    text = text_content(el)
    PATTERN_PARTS = {
        Language.ENG: {
            "article": r"(Art\.|Articles?)",
            "annex": r"(Ann\.|Annex)",
            "exclusion": r"(?! of(?! this))",
        },
        Language.DEU: {
            "article": r"(Art\.|Artikels?)",
            "annex": r"(Anhang)",
            "exclusion": r"(?! von)",
        },
    }
    if language not in PATTERN_PARTS:
        warnings.warn(
            f"Language '{language}' not supported for cross-reference extraction. Returning empty list."
        )
        return []
    # Prevent zealous matching of references to other texts by using a negative lookahead
    # Also, match only at word boundaries to prevent partial matches
    parts = PATTERN_PARTS[language]
    patterns = {
        "article": rf"\b{parts["article"]}\s+(?P<art_num>\d+){parts["exclusion"]}\b",
        "annex": rf"\b{parts["annex"]}\s+(?P<annex_num>[DILMVX]+){parts["exclusion"]}\b",
    }
    for key, pattern in patterns.items():
        matches = re.finditer(pattern, text, flags=re.IGNORECASE)
        print(f"Pattern: {pattern}")
        for match in matches:
            print(f"Match: {match.group(0)}")
            crossref_id = (
                match.group("art_num") if key == "article" else match.group("annex_num")
            )
            crossref_text = match.group(0)
            crossrefs.append(
                CrossReference(id=crossref_id, text=crossref_text, target=key)
            )
    return crossrefs
 class FormexArticleConverter:
    """Converts Formex XML <ARTICLE> elements to semantic HTML5."""
-    def __init__(self, namespace: Optional[str] = None):
+    def __init__(self, language: Language, namespace: Optional[str] = None):
        """
        Initialize the converter.
        Args:
            language: Language object to determine the language for cross-reference extraction
            namespace: Optional XML namespace to use when parsing elements
        """
        self.ns = namespace
        self.language = language
        self.ns_prefix = f"{{{namespace}}}" if namespace else ""
    def _get_tag(self, tag: str) -> str:
@@ -67,6 +133,15 @@ class FormexArticleConverter:
        clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
        return f"art-{clean_id}"
    def _replace_xref(self, text: str, xref: CrossReference) -> str:
        """Replace a cross-reference instance with semantic markup in the text."""
        # Replace the cross-reference text with a link
        text = text.replace(
            xref.text,
            f'<a href="" data-target="{xref.target}" data-id="{xref.id}" class="cross-ref">{xref.text}</a>',
        )
        return text
    def _convert_btx(self, element: ET.Element) -> str:
        """
        Convert basic text elements (t_btx, t_btx.seq) to HTML.
@@ -79,6 +154,16 @@ class FormexArticleConverter:
        result = element.text or ""
        is_title = element.tag in ("TI", "STI", "TI.ART", "STI.ART")
        if not is_title and not element.getchildren():
            # Cross-references should be treated at the deepest level
            xrefs = extract_xrefs(element, self.language)
            print("Extracted cross-references: ", xrefs)
            for xref in xrefs:
                # Replace the cross-reference text with a link
                result = self._replace_xref(result, xref)
        for child in element:
            child_tag = child.tag.replace(self.ns_prefix, "")
@@ -176,7 +261,13 @@ class FormexArticleConverter:
                result += self._convert_btx(child)
            if child.tail:
-                result += child.tail
+                xrefs = extract_xrefs(child, self.language)
                tail_text = child.tail
                for xref in xrefs:
                    # Replace the cross-reference text with a link
                    tail_text = self._replace_xref(tail_text, xref)
                result += tail_text
        return result
@@ -200,6 +291,12 @@ class FormexArticleConverter:
                    if no_p is not None and txt is not None:
                        num = self._get_text(no_p)
                        text = self._get_text(txt)
                        # Handle cross-references within the text
                        xrefs = extract_xrefs(txt, self.language)
                        for xref in xrefs:
                            text = self._replace_xref(text, xref)
                        item_content += f'<span class="item-number">{num}</span> {text}'
                elif child_tag == "P":
                    # Regular paragraph
@@ -133,7 +133,7 @@ def article(celex_id: str, article_id: int, language: Language = Language.ENG):
        num = article.get("IDENTIFIER").lstrip("0")
        if num == str(article_id):
            return Response(
-                FormexArticleConverter().convert_article(article),
+                FormexArticleConverter(language=language).convert_article(article),
                media_type="text/html",
            )