Compare commits

...

5 Commits

Author SHA1 Message Date
Adrian Rumpold
58bd1160c1 fix: Improve rendering of TOC panel 2025-05-20 12:08:14 +02:00
Adrian Rumpold
debaf567ea feat: Add additional example legislation 2025-05-20 12:07:58 +02:00
Adrian Rumpold
56d271d0df fix: Correctly convert single paragraph in FastAPI 2025-05-20 09:14:47 +02:00
Adrian Rumpold
f0d4214d17 fix: Preserve XML tree order in Formex parser 2025-05-20 09:05:32 +02:00
Adrian Rumpold
56b5e3e3a4 fix: Type hints in Formex parser 2025-05-20 08:37:16 +02:00
9 changed files with 571 additions and 457 deletions

View File

@@ -12,7 +12,13 @@ describe("CelexSelector", () => {
expect(getByLabelText("Select example:")).toBeInTheDocument(); expect(getByLabelText("Select example:")).toBeInTheDocument();
expect(getByRole("combobox")).toBeInTheDocument(); expect(getByRole("combobox")).toBeInTheDocument();
const options = getAllByRole("option"); const [def, ...options] = getAllByRole("option");
// First option is the disabled placeholder option
expect(def).toHaveValue("");
expect(def).toHaveTextContent("Select an example");
expect(def).toBeDisabled();
expect(options).toHaveLength(examples.length); expect(options).toHaveLength(examples.length);
for (const i in examples) { for (const i in examples) {
expect(options[i]).toHaveValue(examples[i].id); expect(options[i]).toHaveValue(examples[i].id);

View File

@@ -1,16 +1,15 @@
.toc { .toc {
font-size: 0.8rem; font-size: 0.8rem;
min-width: 25vw; flex: 1 0 25vw;
flex: 1 auto;
&.hidden { &.hidden {
flex: 0 0; display: none;
min-width: 0;
} }
transition: flex-basis 0.1s ease-in-out;
overflow-y: scroll; overflow-y: scroll;
overflow-x: wrap; overflow-x: wrap;
height: 100vh;
.tocDivision { .tocDivision {
margin-block: 0.5rem; margin-block: 0.5rem;

View File

@@ -55,15 +55,17 @@ function TOC({ toc }: TOCProps) {
const [isVisible, setIsVisible] = useState(true); const [isVisible, setIsVisible] = useState(true);
return ( return (
<nav className={[styles.toc, isVisible ? "" : styles.hidden].join(" ")}> <>
<button <button
onClick={() => setIsVisible(!isVisible)} onClick={() => setIsVisible(!isVisible)}
className={styles.toggleButton} className={styles.toggleButton}
> >
{isVisible ? "<" : ">"} {isVisible ? "<" : ">"}
</button> </button>
<nav className={[styles.toc, isVisible ? "" : styles.hidden].join(" ")}>
{toc.map((division) => renderDivision(division))} {toc.map((division) => renderDivision(division))}
</nav> </nav>
</>
); );
} }
export default TOC; export default TOC;

View File

@@ -1,5 +1,9 @@
export const examples = [ export const examples = [
{ name: "GDPR", id: "32016R0679" }, { name: "GDPR", id: "32016R0679" },
{ name: "AI Act", id: "32024R1689" }, { name: "AI Act", id: "32024R1689" },
{ name: "Cybersecurity Act", id: "32019R0881" },
{ name: "Cyber Resilience Act", id: "32024R2847" }, { name: "Cyber Resilience Act", id: "32024R2847" },
{ name: "Medical Device Regulation", id: "32017R0745" },
{ name: "NIS 2 Directive", id: "32022L2555" },
{ name: "Digital Services Act", id: "32022R2065" },
]; ];

View File

@@ -20,3 +20,8 @@ formex-viewer = "formex_viewer:main"
[build-system] [build-system]
requires = ["hatchling"] requires = ["hatchling"]
build-backend = "hatchling.build" build-backend = "hatchling.build"
[dependency-groups]
dev = [
"pytest>=8.3.5",
]

View File

@@ -2,7 +2,7 @@ import html
import re import re
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import Literal, Optional, Union from typing import Literal, Optional, Union, cast
import lxml.etree import lxml.etree
from lxml import etree as ET from lxml import etree as ET
@@ -10,7 +10,7 @@ from lxml import etree as ET
from formex_viewer.main import Language from formex_viewer.main import Language
def text_content(el: lxml.etree.Element) -> str: def text_content(el: ET._Element) -> str:
"""Get the text content of an XML element, including all child elements.""" """Get the text content of an XML element, including all child elements."""
def _iterate(el): def _iterate(el):
@@ -35,7 +35,7 @@ class CrossReference:
paragraph: int | None = None paragraph: int | None = None
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]: def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
"""Extract cross-references from an XML element. """Extract cross-references from an XML element.
Args: Args:
@@ -80,19 +80,23 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
match.group("art_num") if key == "article" else match.group("annex_num") match.group("art_num") if key == "article" else match.group("annex_num")
) )
parag_num = match.groupdict().get("parag_num") parag_num = match.groupdict().get("parag_num")
if key not in ["article", "annex"]:
raise RuntimeError()
crossref_text = match.group(0) crossref_text = match.group(0)
crossrefs.append( crossrefs.append(
CrossReference( CrossReference(
target=key, target=key,
id=crossref_id, id=crossref_id,
paragraph=parag_num, paragraph=int(parag_num) if parag_num else None,
text=crossref_text, text=crossref_text,
) )
) )
return crossrefs return crossrefs
def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None: def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None:
"""Extract a specific article from a Formex document. """Extract a specific article from a Formex document.
Args: Args:
@@ -109,8 +113,8 @@ def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | No
def extract_paragraph( def extract_paragraph(
doc: ET.ElementBase, article_id: int, paragraph_id: int doc: ET._Element, article_id: int, paragraph_id: int
) -> ET.ElementBase | None: ) -> ET._Element | None:
"""Extract a specific paragraph from an article in a Formex document. """Extract a specific paragraph from an article in a Formex document.
Args: Args:
@@ -146,7 +150,7 @@ class FormexArticleConverter:
"""Get the tag name with namespace if available.""" """Get the tag name with namespace if available."""
return f"{self.ns_prefix}{tag}" return f"{self.ns_prefix}{tag}"
def _get_text(self, element: ET.Element) -> str: def _get_text(self, element: ET._Element) -> str:
"""Get the text content of an element, including all nested text. """Get the text content of an element, including all nested text.
This uses lxml's text_content() method when available, falling back to This uses lxml's text_content() method when available, falling back to
@@ -161,7 +165,7 @@ class FormexArticleConverter:
except AttributeError: except AttributeError:
# Fall back to manual traversal if text_content() is not available # Fall back to manual traversal if text_content() is not available
text = element.text or "" text = element.text or ""
for child in element: for child in element.iterchildren(tag="*"):
text += self._get_text(child) text += self._get_text(child)
if child.tail: if child.tail:
text += child.tail text += child.tail
@@ -182,7 +186,7 @@ class FormexArticleConverter:
) )
return text return text
def _convert_btx(self, element: ET.Element) -> str: def _convert_btx(self, element: ET._Element) -> str:
""" """
Convert basic text elements (t_btx, t_btx.seq) to HTML. Convert basic text elements (t_btx, t_btx.seq) to HTML.
@@ -202,7 +206,7 @@ class FormexArticleConverter:
# Replace the cross-reference text with a link # Replace the cross-reference text with a link
result = self._replace_xref(result, xref) result = self._replace_xref(result, xref)
for child in element: for child in element.iterchildren(tag="*"):
child_tag = child.tag.replace(self.ns_prefix, "") child_tag = child.tag.replace(self.ns_prefix, "")
# Process common inline elements # Process common inline elements
@@ -309,7 +313,7 @@ class FormexArticleConverter:
return result return result
def _convert_list(self, list_element: ET.Element) -> str: def _convert_list(self, list_element: ET._Element) -> str:
"""Convert a Formex LIST element to HTML list items.""" """Convert a Formex LIST element to HTML list items."""
result = "" result = ""
# Using lxml's xpath to get direct child ITEM elements # Using lxml's xpath to get direct child ITEM elements
@@ -347,41 +351,40 @@ class FormexArticleConverter:
return result return result
def _convert_alinea(self, alinea: ET.Element) -> str: def _convert_alinea(self, alinea: ET._Element) -> str:
"""Convert an ALINEA element to HTML.""" """Convert an ALINEA element to HTML."""
return f'<p class="alinea">{self._convert_btx(alinea)}</p>' return f'<p class="alinea">{self._convert_btx(alinea)}</p>'
def _convert_parag(self, parag: ET.Element) -> str: def _convert_parag(self, parag: ET._Element) -> str:
"""Convert a PARAG (paragraph) element to HTML.""" """Convert a PARAG (paragraph) element to HTML."""
identifier = parag.get("IDENTIFIER", "") identifier = parag.get("IDENTIFIER", "")
parag_id = self._create_id(identifier) if identifier else "" parag_id = self._create_id(identifier) if identifier else ""
# Get the paragraph number using XPath
no_parag_elems = parag.xpath(f"./{self._get_tag('NO.PARAG')}")
parag_num = self._get_text(no_parag_elems[0]) if no_parag_elems else ""
# Process the alineas within the paragraph
content = "" content = ""
for alinea in parag.xpath(f"./{self._get_tag('ALINEA')}"): for child in parag.iterchildren(tag="*"):
content += self._convert_alinea(alinea) child_tag = child.tag.replace(self.ns_prefix, "")
if child_tag == "ALINEA":
# Process any comments content += self._convert_alinea(child)
for comment in parag.xpath(f"./{self._get_tag('COMMENT')}"): elif child_tag == "COMMENT":
content += f'<div class="comment">{self._convert_btx(comment)}</div>' content += f'<div class="comment">{self._convert_btx(child)}</div>'
elif child_tag == "QUOT.S":
# Process any quotations content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
for quot in parag.xpath(f"./{self._get_tag('QUOT.S')}"): elif child_tag == "NO.PARAG":
content += ( content += (
f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>' f'<span class="paragraph-number">{self._convert_btx(child)}</span>'
)
else:
raise RuntimeError(
f"Unexpected child element '{child_tag}' in PARAG: {text_content(child)}"
) )
return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>' return f'<div class="paragraph" data-paragraph-id="{parag_id}">{content}</div>'
def _convert_subdiv(self, subdiv: ET.Element) -> str: def _convert_subdiv(self, subdiv: ET._Element) -> str:
"""Convert a SUBDIV (subdivision) element to HTML.""" """Convert a SUBDIV (subdivision) element to HTML, preserving child order."""
# Get the title using XPath # Get the title using XPath (should be the first TITLE child if present)
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
title = "" title = ""
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
if title_elems: if title_elems:
title_elem = title_elems[0] title_elem = title_elems[0]
# Process TI (title) and STI (subtitle) elements # Process TI (title) and STI (subtitle) elements
@@ -396,34 +399,30 @@ class FormexArticleConverter:
if sti_list: if sti_list:
title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>' title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>'
# Process content: either paragraphs, alineas, or nested subdivisions # Process all children in order, skipping TITLE (already handled)
content = "" content = ""
for child in subdiv.iterchildren(tag="*"):
# Process paragraphs directly under this subdivision child_tag = child.tag.replace(self.ns_prefix, "")
for parag in subdiv.xpath(f"./{self._get_tag('PARAG')}"): if child_tag == "TITLE":
content += self._convert_parag(parag) continue # already handled
elif child_tag == "PARAG":
# Process alineas directly under this subdivision content += self._convert_parag(child)
for alinea in subdiv.xpath(f"./{self._get_tag('ALINEA')}"): elif child_tag == "ALINEA":
content += self._convert_alinea(alinea) content += self._convert_alinea(child)
elif child_tag == "COMMENT":
# Process comments directly under this subdivision content += f'<div class="comment">{self._convert_btx(child)}</div>'
for comment in subdiv.xpath(f"./{self._get_tag('COMMENT')}"): elif child_tag == "QUOT.S":
content += f'<div class="comment">{self._convert_btx(comment)}</div>' content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
elif child_tag == "SUBDIV":
# Process quotations directly under this subdivision content += self._convert_subdiv(child)
for quot in subdiv.xpath(f"./{self._get_tag('QUOT.S')}"): else:
content += ( raise RuntimeError(
f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>' f"Unexpected child element '{child_tag}' in SUBDIV: {text_content(child)}"
) )
# Process nested subdivisions directly under this subdivision
for sub in subdiv.xpath(f"./{self._get_tag('SUBDIV')}"):
content += self._convert_subdiv(sub)
return f'<section class="subdivision">{title}{content}</section>' return f'<section class="subdivision">{title}{content}</section>'
def convert_article(self, article: Union[str, ET.Element]) -> str: def convert_article(self, article: Union[str, ET._Element]) -> str:
""" """
Convert a Formex <ARTICLE> element to HTML5. Convert a Formex <ARTICLE> element to HTML5.
@@ -437,7 +436,9 @@ class FormexArticleConverter:
if isinstance(article, str): if isinstance(article, str):
try: try:
parser = ET.XMLParser(remove_blank_text=True) parser = ET.XMLParser(remove_blank_text=True)
article = ET.fromstring(article.encode("utf-8"), parser) article = cast(
ET._Element, ET.fromstring(article.encode("utf-8"), parser)
)
except ET.XMLSyntaxError as e: except ET.XMLSyntaxError as e:
return f"<p>Error parsing XML: {e}</p>" return f"<p>Error parsing XML: {e}</p>"
@@ -471,35 +472,25 @@ class FormexArticleConverter:
# Process the content based on what's present # Process the content based on what's present
content = "" content = ""
# Check if we have alineas directly under the article # Process all child elements (except TITLE) in tree order
alineas = article.xpath(f"./{self._get_tag('ALINEA')}") for child in article.iterchildren(tag="*"):
if alineas: child_tag = child.tag.replace(self.ns_prefix, "")
for alinea in alineas: if child_tag in ["TI.ART", "STI.ART"]:
content += self._convert_alinea(alinea) continue # already handled
elif child_tag == "ALINEA":
# Check if we have paragraphs directly under the article content += self._convert_alinea(child)
parags = article.xpath(f"./{self._get_tag('PARAG')}") elif child_tag == "PARAG":
if parags: content += self._convert_parag(child)
for parag in parags: elif child_tag == "COMMENT":
content += self._convert_parag(parag) content += f'<div class="comment">{self._convert_btx(child)}</div>'
elif child_tag == "QUOT.S":
# Check for comments directly under the article content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
comments = article.xpath(f"./{self._get_tag('COMMENT')}") elif child_tag == "SUBDIV":
if comments: content += self._convert_subdiv(child)
for comment in comments: else:
content += f'<div class="comment">{self._convert_btx(comment)}</div>' raise RuntimeError(
f"Unexpected child element '{child_tag}' in ARTICLE: {text_content(child)}"
# Check for quotations directly under the article )
quots = article.xpath(f"./{self._get_tag('QUOT.S')}")
if quots:
for quot in quots:
content += f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
# Check for subdivisions directly under the article
subdivs = article.xpath(f"./{self._get_tag('SUBDIV')}")
if subdivs:
for subdiv in subdivs:
content += self._convert_subdiv(subdiv)
# Assemble the complete article # Assemble the complete article
return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>' return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>'

View File

@@ -160,9 +160,14 @@ def paragraph(
""" """
xml = _get_fmx4_data(celex_id, language) xml = _get_fmx4_data(celex_id, language)
parag = extract_paragraph(xml, article_id=article_id, paragraph_id=parag_id) parag = extract_paragraph(xml, article_id=article_id, paragraph_id=parag_id)
if parag is None:
return Response(
"Paragraph not found",
status_code=404,
)
return Response( return Response(
FormexArticleConverter(language=language).convert_article(parag), FormexArticleConverter(language=language)._convert_parag(parag),
media_type="text/html", media_type="text/html",
) )

52
tests/test_parser.py Normal file
View File

@@ -0,0 +1,52 @@
import pytest
from lxml import etree as ET
from formex_viewer.formex4 import FormexArticleConverter
from formex_viewer.main import Language
@pytest.fixture
def converter():
    """Provide a FormexArticleConverter configured for English to each test."""
    english_converter = FormexArticleConverter(language=Language.ENG)
    return english_converter
def test_convert_tree_order(converter):
    """Test that HTML blocks in the converted article follow the XML tree order.

    The fixture places a PARAG, COMMENT, ALINEA, QUOT.S and a nested SUBDIV
    (in that order) after the TITLE of a SUBDIV. The test locates one marker
    per block in the generated HTML and requires their offsets to be strictly
    increasing.
    """
    xml = """
<ARTICLE>
<SUBDIV>
<TITLE>
<TI>Subdivision Title</TI>
<STI>Subdivision Subtitle</STI>
</TITLE>
<PARAG IDENTIFIER="001.001">
<NO.PARAG>1</NO.PARAG>
<ALINEA>Paragraph 1 text.</ALINEA>
</PARAG>
<COMMENT>Comment text.</COMMENT>
<ALINEA>Alinea text.</ALINEA>
<QUOT.S>Quotation text.</QUOT.S>
<SUBDIV>
<TITLE>
<TI>Nested Subdivision</TI>
</TITLE>
<ALINEA>Nested alinea.</ALINEA>
</SUBDIV>
</SUBDIV>
</ARTICLE>
"""
    parser = ET.XMLParser(remove_blank_text=True)
    el = ET.fromstring(xml, parser)
    # Named `rendered` rather than `html` to avoid shadowing the stdlib module name.
    rendered = converter.convert_article(el)

    # One marker per block, listed in XML document order:
    # title, parag, comment, alinea, quot, nested subdiv.
    markers = [
        "Subdivision Title",
        'class="paragraph"',
        "Comment text.",
        "Alinea text.",
        "Quotation text.",
        "Nested Subdivision",
    ]
    # str.index raises ValueError for a missing marker, so a dropped block
    # fails the test just like a reordered one.
    positions = [rendered.index(marker) for marker in markers]

    assert all(
        before < after for before, after in zip(positions, positions[1:])
    ), f"HTML blocks out of tree order: {dict(zip(markers, positions))}"

764
uv.lock generated

File diff suppressed because it is too large Load Diff