fix: Preserve XML tree order in Formex parser

2025-05-20 09:05:32 +02:00
parent 56b5e3e3a4
commit f0d4214d17
4 changed files with 524 additions and 432 deletions
--- a/src/formex_viewer/formex4.py
+++ b/src/formex_viewer/formex4.py
@@ -81,7 +81,7 @@ def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
            )
            parag_num = match.groupdict().get("parag_num")

-            if not parag_num or key not in ["article", "annex"]:
+            if key not in ["article", "annex"]:
                raise RuntimeError()

            crossref_text = match.group(0)
@@ -89,7 +89,7 @@ def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
                CrossReference(
                    target=key,
                    id=crossref_id,
-                    paragraph=int(parag_num),
+                    paragraph=int(parag_num) if parag_num else None,
                    text=crossref_text,
                )
            )
@@ -360,32 +360,31 @@ class FormexArticleConverter:
        identifier = parag.get("IDENTIFIER", "")
        parag_id = self._create_id(identifier) if identifier else ""

-        # Get the paragraph number using XPath
-        no_parag_elems = parag.xpath(f"./{self._get_tag('NO.PARAG')}")
-        parag_num = self._get_text(no_parag_elems[0]) if no_parag_elems else ""
-
-        # Process the alineas within the paragraph
        content = ""
-        for alinea in parag.xpath(f"./{self._get_tag('ALINEA')}"):
-            content += self._convert_alinea(alinea)
+        for child in parag.iterchildren(tag="*"):
+            child_tag = child.tag.replace(self.ns_prefix, "")
+            if child_tag == "ALINEA":
+                content += self._convert_alinea(child)
+            elif child_tag == "COMMENT":
+                content += f'<div class="comment">{self._convert_btx(child)}</div>'
+            elif child_tag == "QUOT.S":
+                content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
+            elif child_tag == "NO.PARAG":
+                content += (
+                    f'<span class="paragraph-number">{self._convert_btx(child)}</span>'
+                )
+            else:
+                raise RuntimeError(
+                    f"Unexpected child element '{child_tag}' in PARAG: {text_content(child)}"
+                )

-        # Process any comments
-        for comment in parag.xpath(f"./{self._get_tag('COMMENT')}"):
-            content += f'<div class="comment">{self._convert_btx(comment)}</div>'
-
-        # Process any quotations
-        for quot in parag.xpath(f"./{self._get_tag('QUOT.S')}"):
-            content += (
-                f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
-            )
-
-        return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'
+        return f'<div class="paragraph" data-paragraph-id="{parag_id}">{content}</div>'

    def _convert_subdiv(self, subdiv: ET._Element) -> str:
-        """Convert a SUBDIV (subdivision) element to HTML."""
-        # Get the title using XPath
-        title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
+        """Convert a SUBDIV (subdivision) element to HTML, preserving child order."""
+        # Get the title using XPath (should be the first TITLE child if present)
        title = ""
+        title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
        if title_elems:
            title_elem = title_elems[0]
            # Process TI (title) and STI (subtitle) elements
@@ -400,30 +399,26 @@ class FormexArticleConverter:
            if sti_list:
                title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>'

-        # Process content: either paragraphs, alineas, or nested subdivisions
+        # Process all children in order, skipping TITLE (already handled)
        content = ""
-
-        # Process paragraphs directly under this subdivision
-        for parag in subdiv.xpath(f"./{self._get_tag('PARAG')}"):
-            content += self._convert_parag(parag)
-
-        # Process alineas directly under this subdivision
-        for alinea in subdiv.xpath(f"./{self._get_tag('ALINEA')}"):
-            content += self._convert_alinea(alinea)
-
-        # Process comments directly under this subdivision
-        for comment in subdiv.xpath(f"./{self._get_tag('COMMENT')}"):
-            content += f'<div class="comment">{self._convert_btx(comment)}</div>'
-
-        # Process quotations directly under this subdivision
-        for quot in subdiv.xpath(f"./{self._get_tag('QUOT.S')}"):
-            content += (
-                f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
-            )
-
-        # Process nested subdivisions directly under this subdivision
-        for sub in subdiv.xpath(f"./{self._get_tag('SUBDIV')}"):
-            content += self._convert_subdiv(sub)
+        for child in subdiv.iterchildren(tag="*"):
+            child_tag = child.tag.replace(self.ns_prefix, "")
+            if child_tag == "TITLE":
+                continue  # already handled
+            elif child_tag == "PARAG":
+                content += self._convert_parag(child)
+            elif child_tag == "ALINEA":
+                content += self._convert_alinea(child)
+            elif child_tag == "COMMENT":
+                content += f'<div class="comment">{self._convert_btx(child)}</div>'
+            elif child_tag == "QUOT.S":
+                content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
+            elif child_tag == "SUBDIV":
+                content += self._convert_subdiv(child)
+            else:
+                raise RuntimeError(
+                    f"Unexpected child element '{child_tag}' in SUBDIV: {text_content(child)}"
+                )

        return f'<section class="subdivision">{title}{content}</section>'

@@ -477,35 +472,25 @@ class FormexArticleConverter:
        # Process the content based on what's present
        content = ""

-        # Check if we have alineas directly under the article
-        alineas = article.xpath(f"./{self._get_tag('ALINEA')}")
-        if alineas:
-            for alinea in alineas:
-                content += self._convert_alinea(alinea)
-
-        # Check if we have paragraphs directly under the article
-        parags = article.xpath(f"./{self._get_tag('PARAG')}")
-        if parags:
-            for parag in parags:
-                content += self._convert_parag(parag)
-
-        # Check for comments directly under the article
-        comments = article.xpath(f"./{self._get_tag('COMMENT')}")
-        if comments:
-            for comment in comments:
-                content += f'<div class="comment">{self._convert_btx(comment)}</div>'
-
-        # Check for quotations directly under the article
-        quots = article.xpath(f"./{self._get_tag('QUOT.S')}")
-        if quots:
-            for quot in quots:
-                content += f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
-
-        # Check for subdivisions directly under the article
-        subdivs = article.xpath(f"./{self._get_tag('SUBDIV')}")
-        if subdivs:
-            for subdiv in subdivs:
-                content += self._convert_subdiv(subdiv)
+        # Process all child elements (except TI.ART and STI.ART, which are skipped) in tree order
+        for child in article.iterchildren(tag="*"):
+            child_tag = child.tag.replace(self.ns_prefix, "")
+            if child_tag in ["TI.ART", "STI.ART"]:
+                continue  # already handled
+            elif child_tag == "ALINEA":
+                content += self._convert_alinea(child)
+            elif child_tag == "PARAG":
+                content += self._convert_parag(child)
+            elif child_tag == "COMMENT":
+                content += f'<div class="comment">{self._convert_btx(child)}</div>'
+            elif child_tag == "QUOT.S":
+                content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
+            elif child_tag == "SUBDIV":
+                content += self._convert_subdiv(child)
+            else:
+                raise RuntimeError(
+                    f"Unexpected child element '{child_tag}' in ARTICLE: {text_content(child)}"
+                )

        # Assemble the complete article
        return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>'