Compare commits

...

5 Commits

Author SHA1 Message Date
Adrian Rumpold
58bd1160c1 fix: Improve rendering of TOC panel 2025-05-20 12:08:14 +02:00
Adrian Rumpold
debaf567ea feat: Add additional example legislation 2025-05-20 12:07:58 +02:00
Adrian Rumpold
56d271d0df fix: Correctly convert single paragraph in FastAPI 2025-05-20 09:14:47 +02:00
Adrian Rumpold
f0d4214d17 fix: Preserve XML tree order in Formex parser 2025-05-20 09:05:32 +02:00
Adrian Rumpold
56b5e3e3a4 fix: Type hints in Formex parser 2025-05-20 08:37:16 +02:00
9 changed files with 571 additions and 457 deletions

View File

@@ -12,7 +12,13 @@ describe("CelexSelector", () => {
expect(getByLabelText("Select example:")).toBeInTheDocument();
expect(getByRole("combobox")).toBeInTheDocument();
const options = getAllByRole("option");
const [def, ...options] = getAllByRole("option");
// First option is the disabled placeholder option
expect(def).toHaveValue("");
expect(def).toHaveTextContent("Select an example");
expect(def).toBeDisabled();
expect(options).toHaveLength(examples.length);
for (const i in examples) {
expect(options[i]).toHaveValue(examples[i].id);

View File

@@ -1,16 +1,15 @@
.toc {
font-size: 0.8rem;
min-width: 25vw;
flex: 1 auto;
flex: 1 0 25vw;
&.hidden {
flex: 0 0;
min-width: 0;
display: none;
}
transition: flex-basis 0.1s ease-in-out;
overflow-y: scroll;
overflow-x: wrap;
height: 100vh;
.tocDivision {
margin-block: 0.5rem;

View File

@@ -55,15 +55,17 @@ function TOC({ toc }: TOCProps) {
const [isVisible, setIsVisible] = useState(true);
return (
<nav className={[styles.toc, isVisible ? "" : styles.hidden].join(" ")}>
<>
<button
onClick={() => setIsVisible(!isVisible)}
className={styles.toggleButton}
>
{isVisible ? "<" : ">"}
</button>
{toc.map((division) => renderDivision(division))}
</nav>
<nav className={[styles.toc, isVisible ? "" : styles.hidden].join(" ")}>
{toc.map((division) => renderDivision(division))}
</nav>
</>
);
}
export default TOC;

View File

@@ -1,5 +1,9 @@
export const examples = [
{ name: "GDPR", id: "32016R0679" },
{ name: "AI Act", id: "32024R1689" },
{ name: "Cybersecurity Act", id: "32019R0881" },
{ name: "Cyber Resilience Act", id: "32024R2847" },
{ name: "Medical Device Regulation", id: "32017R0745" },
{ name: "NIS 2 Directive", id: "32022L2555" },
{ name: "Digital Services Act", id: "32022R2065" },
];

View File

@@ -20,3 +20,8 @@ formex-viewer = "formex_viewer:main"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[dependency-groups]
dev = [
"pytest>=8.3.5",
]

View File

@@ -2,7 +2,7 @@ import html
import re
import warnings
from dataclasses import dataclass
from typing import Literal, Optional, Union
from typing import Literal, Optional, Union, cast
import lxml.etree
from lxml import etree as ET
@@ -10,7 +10,7 @@ from lxml import etree as ET
from formex_viewer.main import Language
def text_content(el: lxml.etree.Element) -> str:
def text_content(el: ET._Element) -> str:
"""Get the text content of an XML element, including all child elements."""
def _iterate(el):
@@ -35,7 +35,7 @@ class CrossReference:
paragraph: int | None = None
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
"""Extract cross-references from an XML element.
Args:
@@ -80,19 +80,23 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
match.group("art_num") if key == "article" else match.group("annex_num")
)
parag_num = match.groupdict().get("parag_num")
if key not in ["article", "annex"]:
raise RuntimeError()
crossref_text = match.group(0)
crossrefs.append(
CrossReference(
target=key,
id=crossref_id,
paragraph=parag_num,
paragraph=int(parag_num) if parag_num else None,
text=crossref_text,
)
)
return crossrefs
def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None:
def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None:
"""Extract a specific article from a Formex document.
Args:
@@ -109,8 +113,8 @@ def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | No
def extract_paragraph(
doc: ET.ElementBase, article_id: int, paragraph_id: int
) -> ET.ElementBase | None:
doc: ET._Element, article_id: int, paragraph_id: int
) -> ET._Element | None:
"""Extract a specific paragraph from an article in a Formex document.
Args:
@@ -146,7 +150,7 @@ class FormexArticleConverter:
"""Get the tag name with namespace if available."""
return f"{self.ns_prefix}{tag}"
def _get_text(self, element: ET.Element) -> str:
def _get_text(self, element: ET._Element) -> str:
"""Get the text content of an element, including all nested text.
This uses lxml's text_content() method when available, falling back to
@@ -161,7 +165,7 @@ class FormexArticleConverter:
except AttributeError:
# Fall back to manual traversal if text_content() is not available
text = element.text or ""
for child in element:
for child in element.iterchildren(tag="*"):
text += self._get_text(child)
if child.tail:
text += child.tail
@@ -182,7 +186,7 @@ class FormexArticleConverter:
)
return text
def _convert_btx(self, element: ET.Element) -> str:
def _convert_btx(self, element: ET._Element) -> str:
"""
Convert basic text elements (t_btx, t_btx.seq) to HTML.
@@ -202,7 +206,7 @@ class FormexArticleConverter:
# Replace the cross-reference text with a link
result = self._replace_xref(result, xref)
for child in element:
for child in element.iterchildren(tag="*"):
child_tag = child.tag.replace(self.ns_prefix, "")
# Process common inline elements
@@ -309,7 +313,7 @@ class FormexArticleConverter:
return result
def _convert_list(self, list_element: ET.Element) -> str:
def _convert_list(self, list_element: ET._Element) -> str:
"""Convert a Formex LIST element to HTML list items."""
result = ""
# Using lxml's xpath to get direct child ITEM elements
@@ -347,41 +351,40 @@ class FormexArticleConverter:
return result
def _convert_alinea(self, alinea: ET.Element) -> str:
def _convert_alinea(self, alinea: ET._Element) -> str:
"""Convert an ALINEA element to HTML."""
return f'<p class="alinea">{self._convert_btx(alinea)}</p>'
def _convert_parag(self, parag: ET.Element) -> str:
def _convert_parag(self, parag: ET._Element) -> str:
"""Convert a PARAG (paragraph) element to HTML."""
identifier = parag.get("IDENTIFIER", "")
parag_id = self._create_id(identifier) if identifier else ""
# Get the paragraph number using XPath
no_parag_elems = parag.xpath(f"./{self._get_tag('NO.PARAG')}")
parag_num = self._get_text(no_parag_elems[0]) if no_parag_elems else ""
# Process the alineas within the paragraph
content = ""
for alinea in parag.xpath(f"./{self._get_tag('ALINEA')}"):
content += self._convert_alinea(alinea)
for child in parag.iterchildren(tag="*"):
child_tag = child.tag.replace(self.ns_prefix, "")
if child_tag == "ALINEA":
content += self._convert_alinea(child)
elif child_tag == "COMMENT":
content += f'<div class="comment">{self._convert_btx(child)}</div>'
elif child_tag == "QUOT.S":
content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
elif child_tag == "NO.PARAG":
content += (
f'<span class="paragraph-number">{self._convert_btx(child)}</span>'
)
else:
raise RuntimeError(
f"Unexpected child element '{child_tag}' in PARAG: {text_content(child)}"
)
# Process any comments
for comment in parag.xpath(f"./{self._get_tag('COMMENT')}"):
content += f'<div class="comment">{self._convert_btx(comment)}</div>'
return f'<div class="paragraph" data-paragraph-id="{parag_id}">{content}</div>'
# Process any quotations
for quot in parag.xpath(f"./{self._get_tag('QUOT.S')}"):
content += (
f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
)
return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'
def _convert_subdiv(self, subdiv: ET.Element) -> str:
"""Convert a SUBDIV (subdivision) element to HTML."""
# Get the title using XPath
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
def _convert_subdiv(self, subdiv: ET._Element) -> str:
"""Convert a SUBDIV (subdivision) element to HTML, preserving child order."""
# Get the title using XPath (should be the first TITLE child if present)
title = ""
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
if title_elems:
title_elem = title_elems[0]
# Process TI (title) and STI (subtitle) elements
@@ -396,34 +399,30 @@ class FormexArticleConverter:
if sti_list:
title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>'
# Process content: either paragraphs, alineas, or nested subdivisions
# Process all children in order, skipping TITLE (already handled)
content = ""
# Process paragraphs directly under this subdivision
for parag in subdiv.xpath(f"./{self._get_tag('PARAG')}"):
content += self._convert_parag(parag)
# Process alineas directly under this subdivision
for alinea in subdiv.xpath(f"./{self._get_tag('ALINEA')}"):
content += self._convert_alinea(alinea)
# Process comments directly under this subdivision
for comment in subdiv.xpath(f"./{self._get_tag('COMMENT')}"):
content += f'<div class="comment">{self._convert_btx(comment)}</div>'
# Process quotations directly under this subdivision
for quot in subdiv.xpath(f"./{self._get_tag('QUOT.S')}"):
content += (
f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
)
# Process nested subdivisions directly under this subdivision
for sub in subdiv.xpath(f"./{self._get_tag('SUBDIV')}"):
content += self._convert_subdiv(sub)
for child in subdiv.iterchildren(tag="*"):
child_tag = child.tag.replace(self.ns_prefix, "")
if child_tag == "TITLE":
continue # already handled
elif child_tag == "PARAG":
content += self._convert_parag(child)
elif child_tag == "ALINEA":
content += self._convert_alinea(child)
elif child_tag == "COMMENT":
content += f'<div class="comment">{self._convert_btx(child)}</div>'
elif child_tag == "QUOT.S":
content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
elif child_tag == "SUBDIV":
content += self._convert_subdiv(child)
else:
raise RuntimeError(
f"Unexpected child element '{child_tag}' in SUBDIV: {text_content(child)}"
)
return f'<section class="subdivision">{title}{content}</section>'
def convert_article(self, article: Union[str, ET.Element]) -> str:
def convert_article(self, article: Union[str, ET._Element]) -> str:
"""
Convert a Formex <ARTICLE> element to HTML5.
@@ -437,7 +436,9 @@ class FormexArticleConverter:
if isinstance(article, str):
try:
parser = ET.XMLParser(remove_blank_text=True)
article = ET.fromstring(article.encode("utf-8"), parser)
article = cast(
ET._Element, ET.fromstring(article.encode("utf-8"), parser)
)
except ET.XMLSyntaxError as e:
return f"<p>Error parsing XML: {e}</p>"
@@ -471,35 +472,25 @@ class FormexArticleConverter:
# Process the content based on what's present
content = ""
# Check if we have alineas directly under the article
alineas = article.xpath(f"./{self._get_tag('ALINEA')}")
if alineas:
for alinea in alineas:
content += self._convert_alinea(alinea)
# Check if we have paragraphs directly under the article
parags = article.xpath(f"./{self._get_tag('PARAG')}")
if parags:
for parag in parags:
content += self._convert_parag(parag)
# Check for comments directly under the article
comments = article.xpath(f"./{self._get_tag('COMMENT')}")
if comments:
for comment in comments:
content += f'<div class="comment">{self._convert_btx(comment)}</div>'
# Check for quotations directly under the article
quots = article.xpath(f"./{self._get_tag('QUOT.S')}")
if quots:
for quot in quots:
content += f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
# Check for subdivisions directly under the article
subdivs = article.xpath(f"./{self._get_tag('SUBDIV')}")
if subdivs:
for subdiv in subdivs:
content += self._convert_subdiv(subdiv)
# Process all child elements in tree order, skipping TI.ART and STI.ART (already handled)
for child in article.iterchildren(tag="*"):
child_tag = child.tag.replace(self.ns_prefix, "")
if child_tag in ["TI.ART", "STI.ART"]:
continue # already handled
elif child_tag == "ALINEA":
content += self._convert_alinea(child)
elif child_tag == "PARAG":
content += self._convert_parag(child)
elif child_tag == "COMMENT":
content += f'<div class="comment">{self._convert_btx(child)}</div>'
elif child_tag == "QUOT.S":
content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
elif child_tag == "SUBDIV":
content += self._convert_subdiv(child)
else:
raise RuntimeError(
f"Unexpected child element '{child_tag}' in ARTICLE: {text_content(child)}"
)
# Assemble the complete article
return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>'

View File

@@ -160,9 +160,14 @@ def paragraph(
"""
xml = _get_fmx4_data(celex_id, language)
parag = extract_paragraph(xml, article_id=article_id, paragraph_id=parag_id)
if parag is None:
return Response(
"Paragraph not found",
status_code=404,
)
return Response(
FormexArticleConverter(language=language).convert_article(parag),
FormexArticleConverter(language=language)._convert_parag(parag),
media_type="text/html",
)

52
tests/test_parser.py Normal file
View File

@@ -0,0 +1,52 @@
import pytest
from lxml import etree as ET
from formex_viewer.formex4 import FormexArticleConverter
from formex_viewer.main import Language
@pytest.fixture
def converter():
return FormexArticleConverter(language=Language.ENG)
def test_convert_tree_order(converter):
"""Test that the order of HTML blocks in the converted article matches the order of elements in the XML tree."""
xml = """
<ARTICLE>
<SUBDIV>
<TITLE>
<TI>Subdivision Title</TI>
<STI>Subdivision Subtitle</STI>
</TITLE>
<PARAG IDENTIFIER="001.001">
<NO.PARAG>1</NO.PARAG>
<ALINEA>Paragraph 1 text.</ALINEA>
</PARAG>
<COMMENT>Comment text.</COMMENT>
<ALINEA>Alinea text.</ALINEA>
<QUOT.S>Quotation text.</QUOT.S>
<SUBDIV>
<TITLE>
<TI>Nested Subdivision</TI>
</TITLE>
<ALINEA>Nested alinea.</ALINEA>
</SUBDIV>
</SUBDIV>
</ARTICLE>
"""
parser = ET.XMLParser(remove_blank_text=True)
el = ET.fromstring(xml, parser)
html = converter.convert_article(el)
# Check that the order of HTML blocks matches the order of elements in the XML tree
idx_title = html.index("Subdivision Title")
idx_parag = html.index('class="paragraph"')
idx_comment = html.index("Comment text.")
idx_alinea = html.index("Alinea text.")
idx_quot = html.index("Quotation text.")
idx_nested = html.index("Nested Subdivision")
# The order in the XML: title, parag, comment, alinea, quot, nested subdiv
assert idx_title < idx_parag < idx_comment < idx_alinea < idx_quot < idx_nested

764
uv.lock generated

File diff suppressed because it is too large Load Diff