Article cross-reference linking

This commit is contained in:
Adrian Rumpold
2025-04-29 09:34:14 +02:00
parent 9597ccc3bd
commit 04f46e3893
3 changed files with 117 additions and 3 deletions

View File

@@ -25,6 +25,23 @@ function Panel({ language }: PanelProps) {
const articleElement = articleRef.current; const articleElement = articleRef.current;
if (!articleElement) return; if (!articleElement) return;
// Replace cross-reference links with page navigation
const crossRefs = articleElement.querySelectorAll(
"a.cross-ref"
) as NodeListOf<HTMLAnchorElement>;
crossRefs.forEach((link) => {
const target = link.getAttribute("data-target");
const targetId = link.getAttribute("data-id");
if (target && targetId) {
if (target === "article") {
link.setAttribute("href", `../articles/${targetId}`);
}
} else {
console.warn("No target or ID found for link:", link);
}
});
const paragraphs = articleElement.querySelectorAll(".paragraph"); const paragraphs = articleElement.querySelectorAll(".paragraph");
// Highlight the selected paragraph // Highlight the selected paragraph

View File

@@ -1,10 +1,14 @@
import html import html
import re import re
import warnings
from dataclasses import dataclass
from typing import Optional, Union from typing import Optional, Union
import lxml.etree import lxml.etree
from lxml import etree as ET from lxml import etree as ET
from formex_viewer.main import Language
def text_content(el: lxml.etree.Element) -> str: def text_content(el: lxml.etree.Element) -> str:
"""Get the text content of an XML element, including all child elements.""" """Get the text content of an XML element, including all child elements."""
@@ -23,17 +27,79 @@ def text_content(el: lxml.etree.Element) -> str:
return "".join(_iterate(el)) return "".join(_iterate(el))
@dataclass
class CrossReference:
    """A single cross-reference found in the text of a document element."""

    # Identifier of the referenced item: the article number or annex numeral
    # captured from the matched text.
    id: str
    # The full matched reference as it appears in the source text.
    text: str
    # Kind of item referenced: "article" or "annex".
    target: str
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
    """Extract article and annex cross-references from an XML element.

    The element's text content (including all child elements) is scanned with
    language-specific patterns for references such as "Article 5" or
    "Annex IV".

    Args:
        el: The XML element to extract cross-references from.
        language: Language of the text; selects the keyword patterns to use.

    Returns:
        A list of CrossReference objects: all article references first, then
        all annex references, each group in order of appearance. An empty
        list is returned (and a warning is issued) for unsupported languages.
    """
    pattern_parts = {
        Language.ENG: {
            "article": r"(Art\.|Articles?)",
            "annex": r"(Ann\.|Annex)",
            "exclusion": r"(?! of(?! this))",
        },
        Language.DEU: {
            "article": r"(Art\.|Artikels?)",
            "annex": r"(Anhang)",
            "exclusion": r"(?! von)",
        },
    }

    parts = pattern_parts.get(language)
    if parts is None:
        warnings.warn(
            f"Language '{language}' not supported for cross-reference extraction. "
            "Returning empty list."
        )
        return []

    # Bind the parts to locals before interpolating: reusing the enclosing
    # quote type inside an f-string expression only parses on Python >= 3.12.
    article = parts["article"]
    annex = parts["annex"]
    exclusion = parts["exclusion"]

    # Prevent zealous matching of references to other texts by using a
    # negative lookahead. The word boundaries prevent partial matches; the
    # trailing one also stops the number from backtracking to a shorter
    # prefix in order to dodge the lookahead.
    patterns = {
        "article": (rf"\b{article}\s+(?P<art_num>\d+){exclusion}\b", "art_num"),
        "annex": (rf"\b{annex}\s+(?P<annex_num>[DILMVX]+){exclusion}\b", "annex_num"),
    }

    text = text_content(el)
    crossrefs = []
    for target, (pattern, id_group) in patterns.items():
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            crossrefs.append(
                CrossReference(
                    id=match.group(id_group),
                    text=match.group(0),
                    target=target,
                )
            )
    return crossrefs
class FormexArticleConverter: class FormexArticleConverter:
"""Converts Formex XML <ARTICLE> elements to semantic HTML5.""" """Converts Formex XML <ARTICLE> elements to semantic HTML5."""
def __init__(self, namespace: Optional[str] = None): def __init__(self, language: Language, namespace: Optional[str] = None):
""" """
Initialize the converter. Initialize the converter.
Args: Args:
language: Language object to determine the language for cross-reference extraction
namespace: Optional XML namespace to use when parsing elements namespace: Optional XML namespace to use when parsing elements
""" """
self.ns = namespace self.ns = namespace
self.language = language
self.ns_prefix = f"{{{namespace}}}" if namespace else "" self.ns_prefix = f"{{{namespace}}}" if namespace else ""
def _get_tag(self, tag: str) -> str: def _get_tag(self, tag: str) -> str:
@@ -67,6 +133,15 @@ class FormexArticleConverter:
clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier) clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
return f"art-{clean_id}" return f"art-{clean_id}"
def _replace_xref(self, text: str, xref: CrossReference) -> str:
"""Replace a cross-reference instance with semantic markup in the text."""
# Replace the cross-reference text with a link
text = text.replace(
xref.text,
f'<a href="" data-target="{xref.target}" data-id="{xref.id}" class="cross-ref">{xref.text}</a>',
)
return text
def _convert_btx(self, element: ET.Element) -> str: def _convert_btx(self, element: ET.Element) -> str:
""" """
Convert basic text elements (t_btx, t_btx.seq) to HTML. Convert basic text elements (t_btx, t_btx.seq) to HTML.
@@ -79,6 +154,16 @@ class FormexArticleConverter:
result = element.text or "" result = element.text or ""
is_title = element.tag in ("TI", "STI", "TI.ART", "STI.ART")
if not is_title and not element.getchildren():
# Cross-references should be treated at the deepest level
xrefs = extract_xrefs(element, self.language)
print("Extracted cross-references: ", xrefs)
for xref in xrefs:
# Replace the cross-reference text with a link
result = self._replace_xref(result, xref)
for child in element: for child in element:
child_tag = child.tag.replace(self.ns_prefix, "") child_tag = child.tag.replace(self.ns_prefix, "")
@@ -176,7 +261,13 @@ class FormexArticleConverter:
result += self._convert_btx(child) result += self._convert_btx(child)
if child.tail: if child.tail:
result += child.tail xrefs = extract_xrefs(child, self.language)
tail_text = child.tail
for xref in xrefs:
# Replace the cross-reference text with a link
tail_text = self._replace_xref(tail_text, xref)
result += tail_text
return result return result
@@ -200,6 +291,12 @@ class FormexArticleConverter:
if no_p is not None and txt is not None: if no_p is not None and txt is not None:
num = self._get_text(no_p) num = self._get_text(no_p)
text = self._get_text(txt) text = self._get_text(txt)
# Handle cross-references within the text
xrefs = extract_xrefs(txt, self.language)
for xref in xrefs:
text = self._replace_xref(text, xref)
item_content += f'<span class="item-number">{num}</span> {text}' item_content += f'<span class="item-number">{num}</span> {text}'
elif child_tag == "P": elif child_tag == "P":
# Regular paragraph # Regular paragraph

View File

@@ -133,7 +133,7 @@ def article(celex_id: str, article_id: int, language: Language = Language.ENG):
num = article.get("IDENTIFIER").lstrip("0") num = article.get("IDENTIFIER").lstrip("0")
if num == str(article_id): if num == str(article_id):
return Response( return Response(
FormexArticleConverter().convert_article(article), FormexArticleConverter(language=language).convert_article(article),
media_type="text/html", media_type="text/html",
) )