Article cross-reference linking
This commit is contained in:
@@ -25,6 +25,23 @@ function Panel({ language }: PanelProps) {
|
|||||||
const articleElement = articleRef.current;
|
const articleElement = articleRef.current;
|
||||||
if (!articleElement) return;
|
if (!articleElement) return;
|
||||||
|
|
||||||
|
// Replace cross-reference links with page navigation
|
||||||
|
const crossRefs = articleElement.querySelectorAll(
|
||||||
|
"a.cross-ref"
|
||||||
|
) as NodeListOf<HTMLAnchorElement>;
|
||||||
|
crossRefs.forEach((link) => {
|
||||||
|
const target = link.getAttribute("data-target");
|
||||||
|
const targetId = link.getAttribute("data-id");
|
||||||
|
|
||||||
|
if (target && targetId) {
|
||||||
|
if (target === "article") {
|
||||||
|
link.setAttribute("href", `../articles/${targetId}`);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
console.warn("No target or ID found for link:", link);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
const paragraphs = articleElement.querySelectorAll(".paragraph");
|
const paragraphs = articleElement.querySelectorAll(".paragraph");
|
||||||
|
|
||||||
// Highlight the selected paragraph
|
// Highlight the selected paragraph
|
||||||
|
|||||||
@@ -1,10 +1,14 @@
|
|||||||
import html
|
import html
|
||||||
import re
|
import re
|
||||||
|
import warnings
|
||||||
|
from dataclasses import dataclass
|
||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
import lxml.etree
|
import lxml.etree
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
|
|
||||||
|
from formex_viewer.main import Language
|
||||||
|
|
||||||
|
|
||||||
def text_content(el: lxml.etree.Element) -> str:
|
def text_content(el: lxml.etree.Element) -> str:
|
||||||
"""Get the text content of an XML element, including all child elements."""
|
"""Get the text content of an XML element, including all child elements."""
|
||||||
@@ -23,17 +27,79 @@ def text_content(el: lxml.etree.Element) -> str:
|
|||||||
return "".join(_iterate(el))
|
return "".join(_iterate(el))
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CrossReference:
    """A reference to another part of the document found in running text."""

    # Identifier of the referenced unit: the captured article number or
    # annex numeral, kept as a string.
    id: str
    # The exact matched substring (e.g. the label plus its number).
    text: str
    # Kind of unit referenced; the extractor emits "article" or "annex".
    target: str
|
||||||
|
|
||||||
|
|
||||||
|
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
    """Extract article and annex cross-references from an XML element.

    The element's text content (as produced by ``text_content``) is scanned
    case-insensitively with language-specific patterns. Article references
    are a label followed by a decimal number; annex references are a label
    followed by a Roman numeral built from the letters D, I, L, M, V and X.

    Args:
        el: The XML element to extract cross-references from.
        language: Language of the text. Only ``Language.ENG`` and
            ``Language.DEU`` have patterns; any other value triggers a
            warning and yields an empty list.

    Returns:
        A list of ``CrossReference`` objects, articles first and then
        annexes, each group in order of first appearance. A reference that
        occurs several times with identical text is reported once, so that
        callers substituting every occurrence do not process it twice.
    """
    pattern_parts = {
        Language.ENG: {
            "article": r"(Art\.|Articles?)",
            "annex": r"(Ann\.|Annex)",
            # Skip "Article 5 of <other act>" but keep "Article 5 of this ...".
            "exclusion": r"(?! of(?! this))",
        },
        Language.DEU: {
            "article": r"(Art\.|Artikels?)",
            "annex": r"(Anhang)",
            "exclusion": r"(?! von)",
        },
    }

    if language not in pattern_parts:
        warnings.warn(
            f"Language '{language}' not supported for cross-reference extraction. Returning empty list."
        )
        return []

    # Bind the fragments to locals so the f-strings below need no nested
    # quotes; reusing the outer quote type inside a replacement field only
    # parses on Python 3.12+.
    parts = pattern_parts[language]
    article_label = parts["article"]
    annex_label = parts["annex"]
    exclusion = parts["exclusion"]

    # Prevent zealous matching of references to other texts by using a
    # negative lookahead. Also, match only at word boundaries to prevent
    # partial matches.
    patterns = {
        "article": rf"\b{article_label}\s+(?P<num>\d+){exclusion}\b",
        "annex": rf"\b{annex_label}\s+(?P<num>[DILMVX]+){exclusion}\b",
    }

    text = text_content(el)
    crossrefs = []
    seen = set()
    for target, pattern in patterns.items():
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            ref_id = match.group("num")
            ref_text = match.group(0)
            key = (target, ref_id, ref_text)
            if key in seen:
                continue
            seen.add(key)
            crossrefs.append(CrossReference(id=ref_id, text=ref_text, target=target))
    return crossrefs
|
||||||
|
|
||||||
|
|
||||||
class FormexArticleConverter:
|
class FormexArticleConverter:
|
||||||
"""Converts Formex XML <ARTICLE> elements to semantic HTML5."""
|
"""Converts Formex XML <ARTICLE> elements to semantic HTML5."""
|
||||||
|
|
||||||
def __init__(self, namespace: Optional[str] = None):
|
def __init__(self, language: Language, namespace: Optional[str] = None):
|
||||||
"""
|
"""
|
||||||
Initialize the converter.
|
Initialize the converter.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
language: Language object to determine the language for cross-reference extraction
|
||||||
namespace: Optional XML namespace to use when parsing elements
|
namespace: Optional XML namespace to use when parsing elements
|
||||||
"""
|
"""
|
||||||
self.ns = namespace
|
self.ns = namespace
|
||||||
|
self.language = language
|
||||||
self.ns_prefix = f"{{{namespace}}}" if namespace else ""
|
self.ns_prefix = f"{{{namespace}}}" if namespace else ""
|
||||||
|
|
||||||
def _get_tag(self, tag: str) -> str:
|
def _get_tag(self, tag: str) -> str:
|
||||||
@@ -67,6 +133,15 @@ class FormexArticleConverter:
|
|||||||
clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
|
clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
|
||||||
return f"art-{clean_id}"
|
return f"art-{clean_id}"
|
||||||
|
|
||||||
|
def _replace_xref(self, text: str, xref: "CrossReference") -> str:
    """Wrap every occurrence of a cross-reference in ``text`` in a link.

    Each occurrence of ``xref.text`` becomes an ``<a class="cross-ref">``
    element with an empty ``href`` and the reference's target kind and
    identifier in its ``data-target`` and ``data-id`` attributes.

    Args:
        text: The text (possibly already containing cross-reference links)
            in which to mark up the reference.
        xref: The cross-reference to link; its ``text``, ``target`` and
            ``id`` attributes are read.

    Returns:
        The text with the matching occurrences replaced. Occurrences that
        are part of a longer word or number, or that are already followed
        by a closing ``</a>`` tag, are left untouched, so applying the same
        reference twice gives the same result as applying it once.
    """
    if not xref.text:
        # An empty needle would otherwise match between every character.
        return text

    anchor = (
        f'<a href="" data-target="{xref.target}" data-id="{xref.id}" '
        f'class="cross-ref">{xref.text}</a>'
    )
    # (?<!\w) / (?!\w): do not match "Article 1" inside "Article 12".
    # (?!</a>): skip text that an earlier pass already wrapped in a link.
    pattern = re.compile(rf"(?<!\w){re.escape(xref.text)}(?!\w)(?!</a>)")
    # A callable replacement keeps backslashes in the anchor literal.
    return pattern.sub(lambda _match: anchor, text)
|
||||||
|
|
||||||
def _convert_btx(self, element: ET.Element) -> str:
|
def _convert_btx(self, element: ET.Element) -> str:
|
||||||
"""
|
"""
|
||||||
Convert basic text elements (t_btx, t_btx.seq) to HTML.
|
Convert basic text elements (t_btx, t_btx.seq) to HTML.
|
||||||
@@ -79,6 +154,16 @@ class FormexArticleConverter:
|
|||||||
|
|
||||||
result = element.text or ""
|
result = element.text or ""
|
||||||
|
|
||||||
|
is_title = element.tag in ("TI", "STI", "TI.ART", "STI.ART")
|
||||||
|
if not is_title and not element.getchildren():
|
||||||
|
# Cross-references should be treated at the deepest level
|
||||||
|
xrefs = extract_xrefs(element, self.language)
|
||||||
|
print("Extracted cross-references: ", xrefs)
|
||||||
|
|
||||||
|
for xref in xrefs:
|
||||||
|
# Replace the cross-reference text with a link
|
||||||
|
result = self._replace_xref(result, xref)
|
||||||
|
|
||||||
for child in element:
|
for child in element:
|
||||||
child_tag = child.tag.replace(self.ns_prefix, "")
|
child_tag = child.tag.replace(self.ns_prefix, "")
|
||||||
|
|
||||||
@@ -176,7 +261,13 @@ class FormexArticleConverter:
|
|||||||
result += self._convert_btx(child)
|
result += self._convert_btx(child)
|
||||||
|
|
||||||
if child.tail:
|
if child.tail:
|
||||||
result += child.tail
|
xrefs = extract_xrefs(child, self.language)
|
||||||
|
tail_text = child.tail
|
||||||
|
for xref in xrefs:
|
||||||
|
# Replace the cross-reference text with a link
|
||||||
|
tail_text = self._replace_xref(tail_text, xref)
|
||||||
|
|
||||||
|
result += tail_text
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@@ -200,6 +291,12 @@ class FormexArticleConverter:
|
|||||||
if no_p is not None and txt is not None:
|
if no_p is not None and txt is not None:
|
||||||
num = self._get_text(no_p)
|
num = self._get_text(no_p)
|
||||||
text = self._get_text(txt)
|
text = self._get_text(txt)
|
||||||
|
|
||||||
|
# Handle cross-references within the text
|
||||||
|
xrefs = extract_xrefs(txt, self.language)
|
||||||
|
for xref in xrefs:
|
||||||
|
text = self._replace_xref(text, xref)
|
||||||
|
|
||||||
item_content += f'<span class="item-number">{num}</span> {text}'
|
item_content += f'<span class="item-number">{num}</span> {text}'
|
||||||
elif child_tag == "P":
|
elif child_tag == "P":
|
||||||
# Regular paragraph
|
# Regular paragraph
|
||||||
|
|||||||
@@ -133,7 +133,7 @@ def article(celex_id: str, article_id: int, language: Language = Language.ENG):
|
|||||||
num = article.get("IDENTIFIER").lstrip("0")
|
num = article.get("IDENTIFIER").lstrip("0")
|
||||||
if num == str(article_id):
|
if num == str(article_id):
|
||||||
return Response(
|
return Response(
|
||||||
FormexArticleConverter().convert_article(article),
|
FormexArticleConverter(language=language).convert_article(article),
|
||||||
media_type="text/html",
|
media_type="text/html",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user