Paragraph-level links, preview tooltips

This commit is contained in:
Adrian Rumpold
2025-04-30 12:04:38 +02:00
parent ea7885eeee
commit 7dd913df7b
25 changed files with 569 additions and 102 deletions

View File

@@ -2,7 +2,7 @@ import html
import re
import warnings
from dataclasses import dataclass
from typing import Optional, Union
from typing import Literal, Optional, Union
import lxml.etree
from lxml import etree as ET
@@ -29,9 +29,10 @@ def text_content(el: lxml.etree.Element) -> str:
@dataclass
class CrossReference:
id: str
target: Literal["article", "annex"]
text: str
target: str
id: str
paragraph: int | None = None
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
@@ -69,8 +70,8 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
# Also, match only at word boundaries to prevent partial matches
parts = PATTERN_PARTS[language]
patterns = {
"article": rf"\b{parts["article"]}\s+(?P<art_num>\d+){parts["exclusion"]}\b",
"annex": rf"\b{parts["annex"]}\s+(?P<annex_num>[DILMVX]+){parts["exclusion"]}\b",
"article": rf"\b{parts["article"]}\s+(?P<art_num>\d+)(?:[(](?P<parag_num>\d+)[)])?(?:{parts["exclusion"]})",
"annex": rf"\b{parts["annex"]}\s+(?P<annex_num>[DILMVX]+)(?:{parts["exclusion"]})",
}
for key, pattern in patterns.items():
matches = re.finditer(pattern, text, flags=re.IGNORECASE)
@@ -78,13 +79,54 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
crossref_id = (
match.group("art_num") if key == "article" else match.group("annex_num")
)
parag_num = match.groupdict().get("parag_num")
crossref_text = match.group(0)
crossrefs.append(
CrossReference(id=crossref_id, text=crossref_text, target=key)
CrossReference(
target=key,
id=crossref_id,
paragraph=parag_num,
text=crossref_text,
)
)
return crossrefs
def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None:
"""Extract a specific article from a Formex document.
Args:
doc: The XML document to extract from.
article_id: The article number.
Returns:
The extracted article element.
"""
# Use XPath to find the specific article
xpath = f".//ARTICLE[@IDENTIFIER='{article_id:03d}']"
return doc.xpath(xpath)[0] if doc.xpath(xpath) else None
def extract_paragraph(
doc: ET.ElementBase, article_id: int, paragraph_id: int
) -> ET.ElementBase | None:
"""Extract a specific paragraph from an article in a Formex document.
Args:
doc: The XML document to extract from.
article_id: The article number.
paragraph_id: The paragraph number.
Returns:
The extracted paragraph element.
"""
# Use XPath to find the specific paragraph
xpath = f".//PARAG[@IDENTIFIER='{article_id:03d}.{paragraph_id:03d}']"
return doc.xpath(xpath)[0] if doc.xpath(xpath) else None
class FormexArticleConverter:
"""Converts Formex XML <ARTICLE> elements to semantic HTML5."""
@@ -136,7 +178,7 @@ class FormexArticleConverter:
# Replace the cross-reference text with a link
text = text.replace(
xref.text,
f'<a href="" data-target="{xref.target}" data-id="{xref.id}" class="cross-ref">{xref.text}</a>',
f'<a href="" data-target="{xref.target}" data-id="{xref.id}" data-paragraph-id="{xref.paragraph or ''}" class="cross-ref">{xref.text}</a>',
)
return text
@@ -418,10 +460,13 @@ class FormexArticleConverter:
article_subtitle = self._convert_btx(sti_art) if sti_art is not None else ""
# Build the header section
header = f'<header><h3 class="article-title">{article_title}</h3>'
if article_subtitle:
header += f'<h4 class="article-subtitle">{article_subtitle}</h4>'
header += "</header>"
if article_title and article_subtitle:
header = f'<header><h3 class="article-title">{article_title}</h3>'
if article_subtitle:
header += f'<h4 class="article-subtitle">{article_subtitle}</h4>'
header += "</header>"
else:
header = ""
# Process the content based on what's present
content = ""