Article cross-reference linking
This commit is contained in:
@@ -25,6 +25,23 @@ function Panel({ language }: PanelProps) {
|
||||
const articleElement = articleRef.current;
|
||||
if (!articleElement) return;
|
||||
|
||||
// Replace cross-reference links with page navigation
|
||||
const crossRefs = articleElement.querySelectorAll(
|
||||
"a.cross-ref"
|
||||
) as NodeListOf<HTMLAnchorElement>;
|
||||
crossRefs.forEach((link) => {
|
||||
const target = link.getAttribute("data-target");
|
||||
const targetId = link.getAttribute("data-id");
|
||||
|
||||
if (target && targetId) {
|
||||
if (target === "article") {
|
||||
link.setAttribute("href", `../articles/${targetId}`);
|
||||
}
|
||||
} else {
|
||||
console.warn("No target or ID found for link:", link);
|
||||
}
|
||||
});
|
||||
|
||||
const paragraphs = articleElement.querySelectorAll(".paragraph");
|
||||
|
||||
// Highlight the selected paragraph
|
||||
|
||||
@@ -1,10 +1,14 @@
|
||||
import html
|
||||
import re
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Union
|
||||
|
||||
import lxml.etree
|
||||
from lxml import etree as ET
|
||||
|
||||
from formex_viewer.main import Language
|
||||
|
||||
|
||||
def text_content(el: lxml.etree.Element) -> str:
|
||||
"""Get the text content of an XML element, including all child elements."""
|
||||
@@ -23,17 +27,79 @@ def text_content(el: lxml.etree.Element) -> str:
|
||||
return "".join(_iterate(el))
|
||||
|
||||
|
||||
@dataclass
class CrossReference:
    """A single reference to an article or annex found in running text."""

    # Identifier captured from the reference: the article number for
    # "article" targets, the numeral letters for "annex" targets.
    id: str
    # The full matched reference exactly as it appears in the source text.
    text: str
    # Kind of element referred to; extract_xrefs emits "article" or "annex".
    target: str
|
||||
|
||||
|
||||
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
    """Extract article and annex cross-references from an XML element.

    The element's text content (as produced by ``text_content``) is scanned
    case-insensitively with language-specific regular expressions.

    Args:
        el: The XML element to extract cross-references from.
        language: Language of the text; selects the keyword patterns. Only
            ``Language.ENG`` and ``Language.DEU`` are supported.

    Returns:
        A list of ``CrossReference`` objects, one per match: all article
        matches first, followed by all annex matches. A reference that occurs
        several times in the text is returned once per occurrence. For an
        unsupported language a warning is issued and an empty list is
        returned.
    """
    # Language-specific building blocks of the reference patterns.
    # "exclusion" is a negative lookahead that rejects references pointing
    # into another text (e.g. "Article 5 of Regulation ..."), while the
    # English variant still accepts "... of this ...".
    pattern_parts = {
        Language.ENG: {
            "article": r"(Art\.|Articles?)",
            "annex": r"(Ann\.|Annex)",
            "exclusion": r"(?! of(?! this))",
        },
        Language.DEU: {
            "article": r"(Art\.|Artikels?)",
            "annex": r"(Anhang)",
            "exclusion": r"(?! von)",
        },
    }

    parts = pattern_parts.get(language)
    if parts is None:
        # Bail out before doing any text extraction work.
        warnings.warn(
            f"Language '{language}' not supported for cross-reference extraction. Returning empty list.",
            stacklevel=2,
        )
        return []

    text = text_content(el)

    # Bind the parts to plain names first: reusing the enclosing quote
    # character inside an f-string replacement field only parses on
    # Python 3.12+.
    article_kw = parts["article"]
    annex_kw = parts["annex"]
    exclusion = parts["exclusion"]

    # Prevent overzealous matching of references to other texts by using the
    # negative lookahead. The word boundaries prevent partial matches; the
    # trailing one also stops the engine from backtracking to a shorter
    # number just to satisfy the lookahead.
    patterns = (
        ("article", "art_num", rf"\b{article_kw}\s+(?P<art_num>\d+){exclusion}\b"),
        ("annex", "annex_num", rf"\b{annex_kw}\s+(?P<annex_num>[DILMVX]+){exclusion}\b"),
    )

    return [
        CrossReference(id=match.group(group_name), text=match.group(0), target=target)
        for target, group_name, pattern in patterns
        for match in re.finditer(pattern, text, flags=re.IGNORECASE)
    ]
|
||||
|
||||
|
||||
class FormexArticleConverter:
|
||||
"""Converts Formex XML <ARTICLE> elements to semantic HTML5."""
|
||||
|
||||
def __init__(self, namespace: Optional[str] = None):
|
||||
def __init__(self, language: Language, namespace: Optional[str] = None):
    """
    Set up a converter for documents in one language.

    Args:
        language: Language passed to cross-reference extraction
        namespace: Optional XML namespace of the elements being converted
    """
    self.language = language
    self.ns = namespace

    # Tag prefix in the form "{namespace}"; empty when no namespace is used.
    if namespace:
        self.ns_prefix = f"{{{namespace}}}"
    else:
        self.ns_prefix = ""
|
||||
|
||||
def _get_tag(self, tag: str) -> str:
|
||||
@@ -67,6 +133,15 @@ class FormexArticleConverter:
|
||||
clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
|
||||
return f"art-{clean_id}"
|
||||
|
||||
def _replace_xref(self, text: str, xref: CrossReference) -> str:
    """Wrap occurrences of a cross-reference in ``text`` in a link element.

    The method is safe to call repeatedly with the same reference, which
    happens when a reference occurs several times in the text and is
    therefore extracted several times: occurrences that already sit directly
    inside a cross-reference anchor are left alone, so anchors are never
    nested. Occurrences that are only part of a longer word or number
    (e.g. "Article 1" inside "Article 12") are not touched either.

    Args:
        text: The text (possibly already containing cross-reference
            anchors) in which to replace the reference.
        xref: The cross-reference to mark up.

    Returns:
        The text with each free-standing occurrence of ``xref.text`` replaced
        by ``<a href="" data-target=... data-id=... class="cross-ref">``
        markup. The text is returned unchanged if ``xref.text`` is empty.
    """
    if not xref.text:
        # An empty needle would match at every position in the text.
        return text

    # Attribute values are escaped so that unexpected characters cannot
    # break out of the attribute; the visible link text is inserted exactly
    # as it appeared in the input.
    target = html.escape(str(xref.target), quote=True)
    ref_id = html.escape(str(xref.id), quote=True)
    link = f'<a href="" data-target="{target}" data-id="{ref_id}" class="cross-ref">{xref.text}</a>'

    # The first lookbehind skips occurrences that are already the body of a
    # cross-reference anchor (it mirrors the end of the opening tag built
    # above); the word-character guards skip partial matches.
    pattern = r'(?<!class="cross-ref">)(?<!\w)' + re.escape(xref.text) + r"(?!\w)"

    # A callable replacement keeps backslashes in the link text literal.
    return re.sub(pattern, lambda _match: link, text)
|
||||
|
||||
def _convert_btx(self, element: ET.Element) -> str:
|
||||
"""
|
||||
Convert basic text elements (t_btx, t_btx.seq) to HTML.
|
||||
@@ -79,6 +154,16 @@ class FormexArticleConverter:
|
||||
|
||||
result = element.text or ""
|
||||
|
||||
is_title = element.tag in ("TI", "STI", "TI.ART", "STI.ART")
|
||||
if not is_title and not element.getchildren():
|
||||
# Cross-references should be treated at the deepest level
|
||||
xrefs = extract_xrefs(element, self.language)
|
||||
print("Extracted cross-references: ", xrefs)
|
||||
|
||||
for xref in xrefs:
|
||||
# Replace the cross-reference text with a link
|
||||
result = self._replace_xref(result, xref)
|
||||
|
||||
for child in element:
|
||||
child_tag = child.tag.replace(self.ns_prefix, "")
|
||||
|
||||
@@ -176,7 +261,13 @@ class FormexArticleConverter:
|
||||
result += self._convert_btx(child)
|
||||
|
||||
if child.tail:
|
||||
result += child.tail
|
||||
xrefs = extract_xrefs(child, self.language)
|
||||
tail_text = child.tail
|
||||
for xref in xrefs:
|
||||
# Replace the cross-reference text with a link
|
||||
tail_text = self._replace_xref(tail_text, xref)
|
||||
|
||||
result += tail_text
|
||||
|
||||
return result
|
||||
|
||||
@@ -200,6 +291,12 @@ class FormexArticleConverter:
|
||||
if no_p is not None and txt is not None:
|
||||
num = self._get_text(no_p)
|
||||
text = self._get_text(txt)
|
||||
|
||||
# Handle cross-references within the text
|
||||
xrefs = extract_xrefs(txt, self.language)
|
||||
for xref in xrefs:
|
||||
text = self._replace_xref(text, xref)
|
||||
|
||||
item_content += f'<span class="item-number">{num}</span> {text}'
|
||||
elif child_tag == "P":
|
||||
# Regular paragraph
|
||||
|
||||
@@ -133,7 +133,7 @@ def article(celex_id: str, article_id: int, language: Language = Language.ENG):
|
||||
num = article.get("IDENTIFIER").lstrip("0")
|
||||
if num == str(article_id):
|
||||
return Response(
|
||||
FormexArticleConverter().convert_article(article),
|
||||
FormexArticleConverter(language=language).convert_article(article),
|
||||
media_type="text/html",
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user