From 04f46e3893d355abb5ab5ebac25800cd64805f7a Mon Sep 17 00:00:00 2001 From: Adrian Rumpold Date: Tue, 29 Apr 2025 09:34:14 +0200 Subject: [PATCH] Article cross-reference linking --- frontend/src/components/Panel/Panel.tsx | 17 ++++ src/formex_viewer/formex4.py | 101 +++++++++++++++++++++++- src/formex_viewer/server.py | 2 +- 3 files changed, 117 insertions(+), 3 deletions(-) diff --git a/frontend/src/components/Panel/Panel.tsx b/frontend/src/components/Panel/Panel.tsx index b81cefe..b98dcf3 100644 --- a/frontend/src/components/Panel/Panel.tsx +++ b/frontend/src/components/Panel/Panel.tsx @@ -25,6 +25,23 @@ function Panel({ language }: PanelProps) { const articleElement = articleRef.current; if (!articleElement) return; + // Replace cross-reference links with page navigation + const crossRefs = articleElement.querySelectorAll( + "a.cross-ref" + ) as NodeListOf; + crossRefs.forEach((link) => { + const target = link.getAttribute("data-target"); + const targetId = link.getAttribute("data-id"); + + if (target && targetId) { + if (target === "article") { + link.setAttribute("href", `../articles/${targetId}`); + } + } else { + console.warn("No target or ID found for link:", link); + } + }); + const paragraphs = articleElement.querySelectorAll(".paragraph"); // Highlight the selected paragraph diff --git a/src/formex_viewer/formex4.py b/src/formex_viewer/formex4.py index 89ae5f3..0c004af 100644 --- a/src/formex_viewer/formex4.py +++ b/src/formex_viewer/formex4.py @@ -1,10 +1,14 @@ import html import re +import warnings +from dataclasses import dataclass from typing import Optional, Union import lxml.etree from lxml import etree as ET +from formex_viewer.main import Language + def text_content(el: lxml.etree.Element) -> str: """Get the text content of an XML element, including all child elements.""" @@ -23,17 +27,79 @@ def text_content(el: lxml.etree.Element) -> str: return "".join(_iterate(el)) +@dataclass +class CrossReference: + id: str + text: str + target: str + + +def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]: + """Extract cross-references from an XML element. + + Args: + el: The XML element to extract cross-references from. + + Returns: + A dictionary with cross-reference IDs as keys and their text content as values. + """ + crossrefs = [] + text = text_content(el) + + PATTERN_PARTS = { + Language.ENG: { + "article": r"(Art\.|Articles?)", + "annex": r"(Ann\.|Annex)", + "exclusion": r"(?! of(?! this))", + }, + Language.DEU: { + "article": r"(Art\.|Artikels?)", + "annex": r"(Anhang)", + "exclusion": r"(?! von)", + }, + } + + if language not in PATTERN_PARTS: + warnings.warn( + f"Language '{language}' not supported for cross-reference extraction. Returning empty list." + ) + return [] + + # Prevent zealous matching of references to other texts by using a negative lookahead + # Also, match only at word boundaries to prevent partial matches + parts = PATTERN_PARTS[language] + patterns = { + "article": rf"\b{parts["article"]}\s+(?P\d+){parts["exclusion"]}\b", + "annex": rf"\b{parts["annex"]}\s+(?P[DILMVX]+){parts["exclusion"]}\b", + } + for key, pattern in patterns.items(): + matches = re.finditer(pattern, text, flags=re.IGNORECASE) + print(f"Pattern: {pattern}") + for match in matches: + print(f"Match: {match.group(0)}") + crossref_id = ( + match.group("art_num") if key == "article" else match.group("annex_num") + ) + crossref_text = match.group(0) + crossrefs.append( + CrossReference(id=crossref_id, text=crossref_text, target=key) + ) + return crossrefs + + class FormexArticleConverter: """Converts Formex XML
elements to semantic HTML5.""" - def __init__(self, namespace: Optional[str] = None): + def __init__(self, language: Language, namespace: Optional[str] = None): """ Initialize the converter. Args: + language: Language object to determine the language for cross-reference extraction namespace: Optional XML namespace to use when parsing elements """ self.ns = namespace + self.language = language self.ns_prefix = f"{{{namespace}}}" if namespace else "" def _get_tag(self, tag: str) -> str: @@ -67,6 +133,15 @@ class FormexArticleConverter: clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier) return f"art-{clean_id}" + def _replace_xref(self, text: str, xref: CrossReference) -> str: + """Replace a cross-reference instance with semantic markup in the text.""" + # Replace the cross-reference text with a link + text = text.replace( + xref.text, + f'{xref.text}', + ) + return text + def _convert_btx(self, element: ET.Element) -> str: """ Convert basic text elements (t_btx, t_btx.seq) to HTML. @@ -79,6 +154,16 @@ class FormexArticleConverter: result = element.text or "" + is_title = element.tag in ("TI", "STI", "TI.ART", "STI.ART") + if not is_title and not element.getchildren(): + # Cross-references should be treated at the deepest level + xrefs = extract_xrefs(element, self.language) + print("Extracted cross-references: ", xrefs) + + for xref in xrefs: + # Replace the cross-reference text with a link + result = self._replace_xref(result, xref) + for child in element: child_tag = child.tag.replace(self.ns_prefix, "") @@ -176,7 +261,13 @@ class FormexArticleConverter: result += self._convert_btx(child) if child.tail: - result += child.tail + xrefs = extract_xrefs(child, self.language) + tail_text = child.tail + for xref in xrefs: + # Replace the cross-reference text with a link + tail_text = self._replace_xref(tail_text, xref) + + result += tail_text return result @@ -200,6 +291,12 @@ class FormexArticleConverter: if no_p is not None and txt is not None: num = self._get_text(no_p) text = self._get_text(txt) + + # Handle cross-references within the text + xrefs = extract_xrefs(txt, self.language) + for xref in xrefs: + text = self._replace_xref(text, xref) + item_content += f'{num} {text}' elif child_tag == "P": # Regular paragraph diff --git a/src/formex_viewer/server.py b/src/formex_viewer/server.py index 75f7088..b586069 100644 --- a/src/formex_viewer/server.py +++ b/src/formex_viewer/server.py @@ -133,7 +133,7 @@ def article(celex_id: str, article_id: int, language: Language = Language.ENG): num = article.get("IDENTIFIER").lstrip("0") if num == str(article_id): return Response( - FormexArticleConverter().convert_article(article), + FormexArticleConverter(language=language).convert_article(article), media_type="text/html", )