Paragraph-level links, preview tooltips

This commit is contained in:
Adrian Rumpold
2025-04-30 12:04:38 +02:00
parent ea7885eeee
commit 7dd913df7b
25 changed files with 569 additions and 102 deletions

View File

@@ -2,7 +2,7 @@ import html
import re
import warnings
from dataclasses import dataclass
from typing import Optional, Union
from typing import Literal, Optional, Union
import lxml.etree
from lxml import etree as ET
@@ -29,9 +29,10 @@ def text_content(el: lxml.etree.Element) -> str:
@dataclass
class CrossReference:
id: str
target: Literal["article", "annex"]
text: str
target: str
id: str
paragraph: int | None = None
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
@@ -69,8 +70,8 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
# Also, match only at word boundaries to prevent partial matches
parts = PATTERN_PARTS[language]
patterns = {
"article": rf"\b{parts["article"]}\s+(?P<art_num>\d+){parts["exclusion"]}\b",
"annex": rf"\b{parts["annex"]}\s+(?P<annex_num>[DILMVX]+){parts["exclusion"]}\b",
"article": rf"\b{parts["article"]}\s+(?P<art_num>\d+)(?:[(](?P<parag_num>\d+)[)])?(?:{parts["exclusion"]})",
"annex": rf"\b{parts["annex"]}\s+(?P<annex_num>[DILMVX]+)(?:{parts["exclusion"]})",
}
for key, pattern in patterns.items():
matches = re.finditer(pattern, text, flags=re.IGNORECASE)
@@ -78,13 +79,54 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
crossref_id = (
match.group("art_num") if key == "article" else match.group("annex_num")
)
parag_num = match.groupdict().get("parag_num")
crossref_text = match.group(0)
crossrefs.append(
CrossReference(id=crossref_id, text=crossref_text, target=key)
CrossReference(
target=key,
id=crossref_id,
paragraph=parag_num,
text=crossref_text,
)
)
return crossrefs
def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None:
    """Extract a specific article from a Formex document.

    Args:
        doc: The XML document (or subtree) to search.
        article_id: The article number. It is zero-padded to three digits
            to form the ``IDENTIFIER`` attribute value that is looked up.

    Returns:
        The first matching ``ARTICLE`` element, or ``None`` if no element
        carries that identifier.
    """
    # Evaluate the XPath query once and reuse the result instead of running
    # the same descendant search a second time just to index into it.
    matches = doc.xpath(f".//ARTICLE[@IDENTIFIER='{article_id:03d}']")
    return matches[0] if matches else None
def extract_paragraph(
    doc: ET.ElementBase, article_id: int, paragraph_id: int
) -> ET.ElementBase | None:
    """Extract a specific paragraph from an article in a Formex document.

    Args:
        doc: The XML document (or subtree) to search.
        article_id: The article number.
        paragraph_id: The paragraph number within the article.

    Returns:
        The first ``PARAG`` element whose ``IDENTIFIER`` attribute equals
        ``"<article>.<paragraph>"`` (both zero-padded to three digits), or
        ``None`` if no such element exists.
    """
    # Evaluate the XPath query once and reuse the result instead of running
    # the same descendant search a second time just to index into it.
    matches = doc.xpath(
        f".//PARAG[@IDENTIFIER='{article_id:03d}.{paragraph_id:03d}']"
    )
    return matches[0] if matches else None
class FormexArticleConverter:
"""Converts Formex XML <ARTICLE> elements to semantic HTML5."""
@@ -136,7 +178,7 @@ class FormexArticleConverter:
# Replace the cross-reference text with a link
text = text.replace(
xref.text,
f'<a href="" data-target="{xref.target}" data-id="{xref.id}" class="cross-ref">{xref.text}</a>',
f'<a href="" data-target="{xref.target}" data-id="{xref.id}" data-paragraph-id="{xref.paragraph or ''}" class="cross-ref">{xref.text}</a>',
)
return text
@@ -418,10 +460,13 @@ class FormexArticleConverter:
article_subtitle = self._convert_btx(sti_art) if sti_art is not None else ""
# Build the header section
header = f'<header><h3 class="article-title">{article_title}</h3>'
if article_subtitle:
header += f'<h4 class="article-subtitle">{article_subtitle}</h4>'
header += "</header>"
if article_title and article_subtitle:
header = f'<header><h3 class="article-title">{article_title}</h3>'
if article_subtitle:
header += f'<h4 class="article-subtitle">{article_subtitle}</h4>'
header += "</header>"
else:
header = ""
# Process the content based on what's present
content = ""

View File

@@ -2,7 +2,11 @@ import lxml.etree as ET
from fastapi import APIRouter, FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from formex_viewer.formex4 import FormexArticleConverter
from formex_viewer.formex4 import (
FormexArticleConverter,
extract_article,
extract_paragraph,
)
from formex_viewer.main import (
CellarClient,
CellarIdentifier,
@@ -121,21 +125,46 @@ def toc(celex_id: str, language: Language = Language.ENG):
@api_router.get("/{celex_id}/articles/{article_id}/{language}")
def article(celex_id: str, article_id: int, language: Language = Language.ENG):
def article(
celex_id: str,
article_id: int,
language: Language = Language.ENG,
):
"""
Fetch an article from the server.
"""
xml = _get_fmx4_data(celex_id, language)
article = extract_article(xml, article_id=article_id)
article_xpath = "//ARTICLE"
articles = xml.xpath(article_xpath)
for article in articles:
num = article.get("IDENTIFIER").lstrip("0")
if num == str(article_id):
return Response(
FormexArticleConverter(language=language).convert_article(article),
media_type="text/html",
)
if article is None:
return Response(
"Article not found",
status_code=404,
)
return Response(
FormexArticleConverter(language=language).convert_article(article),
media_type="text/html",
)
@api_router.get("/{celex_id}/articles/{article_id}/{parag_id}/{language}")
def paragraph(
    celex_id: str,
    article_id: int,
    parag_id: int,
    language: Language = Language.ENG,
):
    """
    Fetch a paragraph within an article from the server.

    Responds with the converted paragraph as ``text/html``, or with a 404
    when the document has no paragraph with the requested identifiers.
    """
    xml = _get_fmx4_data(celex_id, language)
    parag = extract_paragraph(xml, article_id=article_id, paragraph_id=parag_id)

    # extract_paragraph returns None when nothing matches; answer with a 404
    # (as the article endpoint does) rather than handing None to the converter.
    if parag is None:
        return Response(
            "Paragraph not found",
            status_code=404,
        )

    return Response(
        FormexArticleConverter(language=language).convert_article(parag),
        media_type="text/html",
    )
app.include_router(api_router, prefix="/api")