Article cross-reference linking

This commit is contained in:
Adrian Rumpold
2025-04-29 09:34:14 +02:00
parent 9597ccc3bd
commit 04f46e3893
3 changed files with 117 additions and 3 deletions

View File

@@ -1,10 +1,14 @@
import html
import re
import warnings
from dataclasses import dataclass
from typing import Optional, Union
import lxml.etree
from lxml import etree as ET
from formex_viewer.main import Language
def text_content(el: lxml.etree.Element) -> str:
"""Get the text content of an XML element, including all child elements."""
@@ -23,17 +27,79 @@ def text_content(el: lxml.etree.Element) -> str:
return "".join(_iterate(el))
@dataclass
class CrossReference:
    """A single textual cross-reference found in a document fragment."""

    # Identifier of the referenced unit, as captured from the text
    # (digits for articles, Roman-numeral letters for annexes).
    id: str
    # The exact matched substring, e.g. the keyword plus its number.
    text: str
    # Kind of the referenced unit: "article" or "annex".
    target: str
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
    """Extract article and annex cross-references from an XML element.

    The element's complete text content (as produced by ``text_content``) is
    scanned with language-specific regular expressions.

    Args:
        el: The XML element to extract cross-references from.
        language: Language of the text; selects the keywords used for matching.

    Returns:
        A list of ``CrossReference`` objects: all article references first,
        followed by all annex references, each group in order of occurrence.
        If the language is not supported, a warning is issued and an empty
        list is returned.
    """
    # Language-specific regex fragments. "exclusion" is a negative lookahead
    # that rejects references pointing into *other* legal texts
    # (e.g. "Article 5 of Regulation ..."), while "of this ..." stays allowed
    # for English.
    pattern_parts = {
        Language.ENG: {
            "article": r"(Art\.|Articles?)",
            "annex": r"(Ann\.|Annex)",
            "exclusion": r"(?! of(?! this))",
        },
        Language.DEU: {
            "article": r"(Art\.|Artikels?)",
            "annex": r"(Anhang)",
            "exclusion": r"(?! von)",
        },
    }

    parts = pattern_parts.get(language)
    if parts is None:
        warnings.warn(
            f"Language '{language}' not supported for cross-reference extraction. Returning empty list.",
            stacklevel=2,
        )
        return []

    # Bind the fragments to plain names first: reusing the enclosing quote
    # character inside an f-string replacement field is only valid syntax on
    # Python 3.12+.
    article_kw = parts["article"]
    annex_kw = parts["annex"]
    exclusion = parts["exclusion"]

    # Each target maps to (pattern, name of the group holding the identifier).
    # The trailing \b after the lookahead stops the regex engine from
    # backtracking to a shorter number (e.g. "1" out of "12") just to get
    # around the exclusion; the leading \b prevents partial keyword matches.
    # NOTE(review): because matching uses re.IGNORECASE, the numeral class
    # [DILMVX]+ also accepts lowercase letter runs (e.g. "annex mild") --
    # confirm whether that is intended.
    patterns = {
        "article": (rf"\b{article_kw}\s+(?P<art_num>\d+){exclusion}\b", "art_num"),
        "annex": (rf"\b{annex_kw}\s+(?P<annex_num>[DILMVX]+){exclusion}\b", "annex_num"),
    }

    text = text_content(el)
    return [
        CrossReference(id=match.group(group), text=match.group(0), target=target)
        for target, (pattern, group) in patterns.items()
        for match in re.finditer(pattern, text, flags=re.IGNORECASE)
    ]
class FormexArticleConverter:
"""Converts Formex XML <ARTICLE> elements to semantic HTML5."""
def __init__(self, namespace: Optional[str] = None):
def __init__(self, language: Language, namespace: Optional[str] = None):
    """
    Set up the converter state.

    Args:
        language: Language passed on to cross-reference extraction
        namespace: Optional XML namespace; when given, it is also stored in
            Clark notation ("{namespace}") as the tag prefix
    """
    self.language = language
    self.ns = namespace
    # An absent or empty namespace yields an empty prefix.
    if namespace:
        self.ns_prefix = f"{{{namespace}}}"
    else:
        self.ns_prefix = ""
def _get_tag(self, tag: str) -> str:
@@ -67,6 +133,15 @@ class FormexArticleConverter:
clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
return f"art-{clean_id}"
def _replace_xref(self, text: str, xref: CrossReference) -> str:
"""Replace a cross-reference instance with semantic markup in the text."""
# Replace the cross-reference text with a link
text = text.replace(
xref.text,
f'<a href="" data-target="{xref.target}" data-id="{xref.id}" class="cross-ref">{xref.text}</a>',
)
return text
def _convert_btx(self, element: ET.Element) -> str:
"""
Convert basic text elements (t_btx, t_btx.seq) to HTML.
@@ -79,6 +154,16 @@ class FormexArticleConverter:
result = element.text or ""
is_title = element.tag in ("TI", "STI", "TI.ART", "STI.ART")
if not is_title and not element.getchildren():
# Cross-references should be treated at the deepest level
xrefs = extract_xrefs(element, self.language)
print("Extracted cross-references: ", xrefs)
for xref in xrefs:
# Replace the cross-reference text with a link
result = self._replace_xref(result, xref)
for child in element:
child_tag = child.tag.replace(self.ns_prefix, "")
@@ -176,7 +261,13 @@ class FormexArticleConverter:
result += self._convert_btx(child)
if child.tail:
result += child.tail
xrefs = extract_xrefs(child, self.language)
tail_text = child.tail
for xref in xrefs:
# Replace the cross-reference text with a link
tail_text = self._replace_xref(tail_text, xref)
result += tail_text
return result
@@ -200,6 +291,12 @@ class FormexArticleConverter:
if no_p is not None and txt is not None:
num = self._get_text(no_p)
text = self._get_text(txt)
# Handle cross-references within the text
xrefs = extract_xrefs(txt, self.language)
for xref in xrefs:
text = self._replace_xref(text, xref)
item_content += f'<span class="item-number">{num}</span> {text}'
elif child_tag == "P":
# Regular paragraph