fix: Preserve XML tree order in Formex parser

This commit is contained in:
Adrian Rumpold
2025-05-20 09:05:32 +02:00
parent 56b5e3e3a4
commit f0d4214d17
4 changed files with 524 additions and 432 deletions

View File

@@ -20,3 +20,8 @@ formex-viewer = "formex_viewer:main"
[build-system] [build-system]
requires = ["hatchling"] requires = ["hatchling"]
build-backend = "hatchling.build" build-backend = "hatchling.build"
[dependency-groups]
dev = [
"pytest>=8.3.5",
]

View File

@@ -81,7 +81,7 @@ def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
) )
parag_num = match.groupdict().get("parag_num") parag_num = match.groupdict().get("parag_num")
if not parag_num or key not in ["article", "annex"]: if key not in ["article", "annex"]:
raise RuntimeError() raise RuntimeError()
crossref_text = match.group(0) crossref_text = match.group(0)
@@ -89,7 +89,7 @@ def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
CrossReference( CrossReference(
target=key, target=key,
id=crossref_id, id=crossref_id,
paragraph=int(parag_num), paragraph=int(parag_num) if parag_num else None,
text=crossref_text, text=crossref_text,
) )
) )
@@ -360,32 +360,31 @@ class FormexArticleConverter:
identifier = parag.get("IDENTIFIER", "") identifier = parag.get("IDENTIFIER", "")
parag_id = self._create_id(identifier) if identifier else "" parag_id = self._create_id(identifier) if identifier else ""
# Get the paragraph number using XPath
no_parag_elems = parag.xpath(f"./{self._get_tag('NO.PARAG')}")
parag_num = self._get_text(no_parag_elems[0]) if no_parag_elems else ""
# Process the alineas within the paragraph
content = "" content = ""
for alinea in parag.xpath(f"./{self._get_tag('ALINEA')}"): for child in parag.iterchildren(tag="*"):
content += self._convert_alinea(alinea) child_tag = child.tag.replace(self.ns_prefix, "")
if child_tag == "ALINEA":
content += self._convert_alinea(child)
elif child_tag == "COMMENT":
content += f'<div class="comment">{self._convert_btx(child)}</div>'
elif child_tag == "QUOT.S":
content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
elif child_tag == "NO.PARAG":
content += (
f'<span class="paragraph-number">{self._convert_btx(child)}</span>'
)
else:
raise RuntimeError(
f"Unexpected child element '{child_tag}' in PARAG: {text_content(child)}"
)
# Process any comments return f'<div class="paragraph" data-paragraph-id="{parag_id}">{content}</div>'
for comment in parag.xpath(f"./{self._get_tag('COMMENT')}"):
content += f'<div class="comment">{self._convert_btx(comment)}</div>'
# Process any quotations
for quot in parag.xpath(f"./{self._get_tag('QUOT.S')}"):
content += (
f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
)
return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'
def _convert_subdiv(self, subdiv: ET._Element) -> str: def _convert_subdiv(self, subdiv: ET._Element) -> str:
"""Convert a SUBDIV (subdivision) element to HTML.""" """Convert a SUBDIV (subdivision) element to HTML, preserving child order."""
# Get the title using XPath # Get the title using XPath (should be the first TITLE child if present)
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
title = "" title = ""
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
if title_elems: if title_elems:
title_elem = title_elems[0] title_elem = title_elems[0]
# Process TI (title) and STI (subtitle) elements # Process TI (title) and STI (subtitle) elements
@@ -400,30 +399,26 @@ class FormexArticleConverter:
if sti_list: if sti_list:
title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>' title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>'
# Process content: either paragraphs, alineas, or nested subdivisions # Process all children in order, skipping TITLE (already handled)
content = "" content = ""
for child in subdiv.iterchildren(tag="*"):
# Process paragraphs directly under this subdivision child_tag = child.tag.replace(self.ns_prefix, "")
for parag in subdiv.xpath(f"./{self._get_tag('PARAG')}"): if child_tag == "TITLE":
content += self._convert_parag(parag) continue # already handled
elif child_tag == "PARAG":
# Process alineas directly under this subdivision content += self._convert_parag(child)
for alinea in subdiv.xpath(f"./{self._get_tag('ALINEA')}"): elif child_tag == "ALINEA":
content += self._convert_alinea(alinea) content += self._convert_alinea(child)
elif child_tag == "COMMENT":
# Process comments directly under this subdivision content += f'<div class="comment">{self._convert_btx(child)}</div>'
for comment in subdiv.xpath(f"./{self._get_tag('COMMENT')}"): elif child_tag == "QUOT.S":
content += f'<div class="comment">{self._convert_btx(comment)}</div>' content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
elif child_tag == "SUBDIV":
# Process quotations directly under this subdivision content += self._convert_subdiv(child)
for quot in subdiv.xpath(f"./{self._get_tag('QUOT.S')}"): else:
content += ( raise RuntimeError(
f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>' f"Unexpected child element '{child_tag}' in SUBDIV: {text_content(child)}"
) )
# Process nested subdivisions directly under this subdivision
for sub in subdiv.xpath(f"./{self._get_tag('SUBDIV')}"):
content += self._convert_subdiv(sub)
return f'<section class="subdivision">{title}{content}</section>' return f'<section class="subdivision">{title}{content}</section>'
@@ -477,35 +472,25 @@ class FormexArticleConverter:
# Process the content based on what's present # Process the content based on what's present
content = "" content = ""
# Check if we have alineas directly under the article # Process all child elements (except TITLE) in tree order
alineas = article.xpath(f"./{self._get_tag('ALINEA')}") for child in article.iterchildren(tag="*"):
if alineas: child_tag = child.tag.replace(self.ns_prefix, "")
for alinea in alineas: if child_tag in ["TI.ART", "STI.ART"]:
content += self._convert_alinea(alinea) continue # already handled
elif child_tag == "ALINEA":
# Check if we have paragraphs directly under the article content += self._convert_alinea(child)
parags = article.xpath(f"./{self._get_tag('PARAG')}") elif child_tag == "PARAG":
if parags: content += self._convert_parag(child)
for parag in parags: elif child_tag == "COMMENT":
content += self._convert_parag(parag) content += f'<div class="comment">{self._convert_btx(child)}</div>'
elif child_tag == "QUOT.S":
# Check for comments directly under the article content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
comments = article.xpath(f"./{self._get_tag('COMMENT')}") elif child_tag == "SUBDIV":
if comments: content += self._convert_subdiv(child)
for comment in comments: else:
content += f'<div class="comment">{self._convert_btx(comment)}</div>' raise RuntimeError(
f"Unexpected child element '{child_tag}' in ARTICLE: {text_content(child)}"
# Check for quotations directly under the article )
quots = article.xpath(f"./{self._get_tag('QUOT.S')}")
if quots:
for quot in quots:
content += f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
# Check for subdivisions directly under the article
subdivs = article.xpath(f"./{self._get_tag('SUBDIV')}")
if subdivs:
for subdiv in subdivs:
content += self._convert_subdiv(subdiv)
# Assemble the complete article # Assemble the complete article
return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>' return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>'

52
tests/test_parser.py Normal file
View File

@@ -0,0 +1,52 @@
import pytest
from lxml import etree as ET
from formex_viewer.formex4 import FormexArticleConverter
from formex_viewer.main import Language
@pytest.fixture
def converter():
    """Provide a fresh ``FormexArticleConverter`` built with ``Language.ENG``."""
    article_converter = FormexArticleConverter(language=Language.ENG)
    return article_converter
def test_convert_tree_order(converter):
    """Check that converted HTML blocks follow the order of the XML source elements.

    Inside the outer SUBDIV the fixture places COMMENT *before* ALINEA. A
    converter that emitted its output grouped by element type (all alineas,
    then all comments, ...) instead of walking the children in document order
    would therefore produce a different sequence and fail this test.

    Raises:
        ValueError: via ``str.index`` if one of the expected markers is
            missing from the generated HTML altogether.
    """
    xml = """
<ARTICLE>
<SUBDIV>
<TITLE>
<TI>Subdivision Title</TI>
<STI>Subdivision Subtitle</STI>
</TITLE>
<PARAG IDENTIFIER="001.001">
<NO.PARAG>1</NO.PARAG>
<ALINEA>Paragraph 1 text.</ALINEA>
</PARAG>
<COMMENT>Comment text.</COMMENT>
<ALINEA>Alinea text.</ALINEA>
<QUOT.S>Quotation text.</QUOT.S>
<SUBDIV>
<TITLE>
<TI>Nested Subdivision</TI>
</TITLE>
<ALINEA>Nested alinea.</ALINEA>
</SUBDIV>
</SUBDIV>
</ARTICLE>
"""
    # Drop whitespace-only text nodes so the line breaks between the fixture's
    # elements do not show up as content.
    parser = ET.XMLParser(remove_blank_text=True)
    el = ET.fromstring(xml, parser)
    html = converter.convert_article(el)

    # One distinctive marker per block, listed in the order the corresponding
    # elements appear in the XML above:
    # title, parag, comment, alinea, quot, nested subdiv.
    expected_order = [
        "Subdivision Title",
        'class="paragraph"',
        "Comment text.",
        "Alinea text.",
        "Quotation text.",
        "Nested Subdivision",
    ]
    positions = [html.index(marker) for marker in expected_order]

    # Every marker must start strictly after the previous one.
    for (prev_marker, prev_pos), (next_marker, next_pos) in zip(
        zip(expected_order, positions),
        zip(expected_order[1:], positions[1:]),
    ):
        assert prev_pos < next_pos, (
            f"{next_marker!r} (at {next_pos}) should come after "
            f"{prev_marker!r} (at {prev_pos}) in the converted HTML"
        )

764
uv.lock generated

File diff suppressed because it is too large Load Diff