fix: Preserve XML tree order in Formex parser
This commit is contained in:
@@ -81,7 +81,7 @@ def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
|
||||
)
|
||||
parag_num = match.groupdict().get("parag_num")
|
||||
|
||||
if not parag_num or key not in ["article", "annex"]:
|
||||
if key not in ["article", "annex"]:
|
||||
raise RuntimeError()
|
||||
|
||||
crossref_text = match.group(0)
|
||||
@@ -89,7 +89,7 @@ def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
|
||||
CrossReference(
|
||||
target=key,
|
||||
id=crossref_id,
|
||||
paragraph=int(parag_num),
|
||||
paragraph=int(parag_num) if parag_num else None,
|
||||
text=crossref_text,
|
||||
)
|
||||
)
|
||||
@@ -360,32 +360,31 @@ class FormexArticleConverter:
|
||||
identifier = parag.get("IDENTIFIER", "")
|
||||
parag_id = self._create_id(identifier) if identifier else ""
|
||||
|
||||
# Get the paragraph number using XPath
|
||||
no_parag_elems = parag.xpath(f"./{self._get_tag('NO.PARAG')}")
|
||||
parag_num = self._get_text(no_parag_elems[0]) if no_parag_elems else ""
|
||||
|
||||
# Process the alineas within the paragraph
|
||||
content = ""
|
||||
for alinea in parag.xpath(f"./{self._get_tag('ALINEA')}"):
|
||||
content += self._convert_alinea(alinea)
|
||||
for child in parag.iterchildren(tag="*"):
|
||||
child_tag = child.tag.replace(self.ns_prefix, "")
|
||||
if child_tag == "ALINEA":
|
||||
content += self._convert_alinea(child)
|
||||
elif child_tag == "COMMENT":
|
||||
content += f'<div class="comment">{self._convert_btx(child)}</div>'
|
||||
elif child_tag == "QUOT.S":
|
||||
content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
|
||||
elif child_tag == "NO.PARAG":
|
||||
content += (
|
||||
f'<span class="paragraph-number">{self._convert_btx(child)}</span>'
|
||||
)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Unexpected child element '{child_tag}' in PARAG: {text_content(child)}"
|
||||
)
|
||||
|
||||
# Process any comments
|
||||
for comment in parag.xpath(f"./{self._get_tag('COMMENT')}"):
|
||||
content += f'<div class="comment">{self._convert_btx(comment)}</div>'
|
||||
|
||||
# Process any quotations
|
||||
for quot in parag.xpath(f"./{self._get_tag('QUOT.S')}"):
|
||||
content += (
|
||||
f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
|
||||
)
|
||||
|
||||
return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'
|
||||
return f'<div class="paragraph" data-paragraph-id="{parag_id}">{content}</div>'
|
||||
|
||||
def _convert_subdiv(self, subdiv: ET._Element) -> str:
|
||||
"""Convert a SUBDIV (subdivision) element to HTML."""
|
||||
# Get the title using XPath
|
||||
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
|
||||
"""Convert a SUBDIV (subdivision) element to HTML, preserving child order."""
|
||||
# Get the title using XPath (should be the first TITLE child if present)
|
||||
title = ""
|
||||
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
|
||||
if title_elems:
|
||||
title_elem = title_elems[0]
|
||||
# Process TI (title) and STI (subtitle) elements
|
||||
@@ -400,30 +399,26 @@ class FormexArticleConverter:
|
||||
if sti_list:
|
||||
title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>'
|
||||
|
||||
# Process content: either paragraphs, alineas, or nested subdivisions
|
||||
# Process all children in order, skipping TITLE (already handled)
|
||||
content = ""
|
||||
|
||||
# Process paragraphs directly under this subdivision
|
||||
for parag in subdiv.xpath(f"./{self._get_tag('PARAG')}"):
|
||||
content += self._convert_parag(parag)
|
||||
|
||||
# Process alineas directly under this subdivision
|
||||
for alinea in subdiv.xpath(f"./{self._get_tag('ALINEA')}"):
|
||||
content += self._convert_alinea(alinea)
|
||||
|
||||
# Process comments directly under this subdivision
|
||||
for comment in subdiv.xpath(f"./{self._get_tag('COMMENT')}"):
|
||||
content += f'<div class="comment">{self._convert_btx(comment)}</div>'
|
||||
|
||||
# Process quotations directly under this subdivision
|
||||
for quot in subdiv.xpath(f"./{self._get_tag('QUOT.S')}"):
|
||||
content += (
|
||||
f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
|
||||
)
|
||||
|
||||
# Process nested subdivisions directly under this subdivision
|
||||
for sub in subdiv.xpath(f"./{self._get_tag('SUBDIV')}"):
|
||||
content += self._convert_subdiv(sub)
|
||||
for child in subdiv.iterchildren(tag="*"):
|
||||
child_tag = child.tag.replace(self.ns_prefix, "")
|
||||
if child_tag == "TITLE":
|
||||
continue # already handled
|
||||
elif child_tag == "PARAG":
|
||||
content += self._convert_parag(child)
|
||||
elif child_tag == "ALINEA":
|
||||
content += self._convert_alinea(child)
|
||||
elif child_tag == "COMMENT":
|
||||
content += f'<div class="comment">{self._convert_btx(child)}</div>'
|
||||
elif child_tag == "QUOT.S":
|
||||
content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
|
||||
elif child_tag == "SUBDIV":
|
||||
content += self._convert_subdiv(child)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Unexpected child element '{child_tag}' in SUBDIV: {text_content(child)}"
|
||||
)
|
||||
|
||||
return f'<section class="subdivision">{title}{content}</section>'
|
||||
|
||||
@@ -477,35 +472,25 @@ class FormexArticleConverter:
|
||||
# Process the content based on what's present
|
||||
content = ""
|
||||
|
||||
# Check if we have alineas directly under the article
|
||||
alineas = article.xpath(f"./{self._get_tag('ALINEA')}")
|
||||
if alineas:
|
||||
for alinea in alineas:
|
||||
content += self._convert_alinea(alinea)
|
||||
|
||||
# Check if we have paragraphs directly under the article
|
||||
parags = article.xpath(f"./{self._get_tag('PARAG')}")
|
||||
if parags:
|
||||
for parag in parags:
|
||||
content += self._convert_parag(parag)
|
||||
|
||||
# Check for comments directly under the article
|
||||
comments = article.xpath(f"./{self._get_tag('COMMENT')}")
|
||||
if comments:
|
||||
for comment in comments:
|
||||
content += f'<div class="comment">{self._convert_btx(comment)}</div>'
|
||||
|
||||
# Check for quotations directly under the article
|
||||
quots = article.xpath(f"./{self._get_tag('QUOT.S')}")
|
||||
if quots:
|
||||
for quot in quots:
|
||||
content += f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
|
||||
|
||||
# Check for subdivisions directly under the article
|
||||
subdivs = article.xpath(f"./{self._get_tag('SUBDIV')}")
|
||||
if subdivs:
|
||||
for subdiv in subdivs:
|
||||
content += self._convert_subdiv(subdiv)
|
||||
# Process all child elements (except TITLE) in tree order
|
||||
for child in article.iterchildren(tag="*"):
|
||||
child_tag = child.tag.replace(self.ns_prefix, "")
|
||||
if child_tag in ["TI.ART", "STI.ART"]:
|
||||
continue # already handled
|
||||
elif child_tag == "ALINEA":
|
||||
content += self._convert_alinea(child)
|
||||
elif child_tag == "PARAG":
|
||||
content += self._convert_parag(child)
|
||||
elif child_tag == "COMMENT":
|
||||
content += f'<div class="comment">{self._convert_btx(child)}</div>'
|
||||
elif child_tag == "QUOT.S":
|
||||
content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
|
||||
elif child_tag == "SUBDIV":
|
||||
content += self._convert_subdiv(child)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Unexpected child element '{child_tag}' in ARTICLE: {text_content(child)}"
|
||||
)
|
||||
|
||||
# Assemble the complete article
|
||||
return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>'
|
||||
|
||||
Reference in New Issue
Block a user