fix: Preserve XML tree order in Formex parser
This commit is contained in:
		| @@ -81,7 +81,7 @@ def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]: | ||||
|             ) | ||||
|             parag_num = match.groupdict().get("parag_num") | ||||
|  | ||||
|             if not parag_num or key not in ["article", "annex"]: | ||||
|             if key not in ["article", "annex"]: | ||||
|                 raise RuntimeError() | ||||
|  | ||||
|             crossref_text = match.group(0) | ||||
| @@ -89,7 +89,7 @@ def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]: | ||||
|                 CrossReference( | ||||
|                     target=key, | ||||
|                     id=crossref_id, | ||||
|                     paragraph=int(parag_num), | ||||
|                     paragraph=int(parag_num) if parag_num else None, | ||||
|                     text=crossref_text, | ||||
|                 ) | ||||
|             ) | ||||
| @@ -360,32 +360,31 @@ class FormexArticleConverter: | ||||
|         identifier = parag.get("IDENTIFIER", "") | ||||
|         parag_id = self._create_id(identifier) if identifier else "" | ||||
|  | ||||
|         # Get the paragraph number using XPath | ||||
|         no_parag_elems = parag.xpath(f"./{self._get_tag('NO.PARAG')}") | ||||
|         parag_num = self._get_text(no_parag_elems[0]) if no_parag_elems else "" | ||||
|  | ||||
|         # Process the alineas within the paragraph | ||||
|         content = "" | ||||
|         for alinea in parag.xpath(f"./{self._get_tag('ALINEA')}"): | ||||
|             content += self._convert_alinea(alinea) | ||||
|         for child in parag.iterchildren(tag="*"): | ||||
|             child_tag = child.tag.replace(self.ns_prefix, "") | ||||
|             if child_tag == "ALINEA": | ||||
|                 content += self._convert_alinea(child) | ||||
|             elif child_tag == "COMMENT": | ||||
|                 content += f'<div class="comment">{self._convert_btx(child)}</div>' | ||||
|             elif child_tag == "QUOT.S": | ||||
|                 content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>' | ||||
|             elif child_tag == "NO.PARAG": | ||||
|                 content += ( | ||||
|                     f'<span class="paragraph-number">{self._convert_btx(child)}</span>' | ||||
|                 ) | ||||
|             else: | ||||
|                 raise RuntimeError( | ||||
|                     f"Unexpected child element '{child_tag}' in PARAG: {text_content(child)}" | ||||
|                 ) | ||||
|  | ||||
|         # Process any comments | ||||
|         for comment in parag.xpath(f"./{self._get_tag('COMMENT')}"): | ||||
|             content += f'<div class="comment">{self._convert_btx(comment)}</div>' | ||||
|  | ||||
|         # Process any quotations | ||||
|         for quot in parag.xpath(f"./{self._get_tag('QUOT.S')}"): | ||||
|             content += ( | ||||
|                 f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>' | ||||
|             ) | ||||
|  | ||||
|         return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>' | ||||
|         return f'<div class="paragraph" data-paragraph-id="{parag_id}">{content}</div>' | ||||
|  | ||||
|     def _convert_subdiv(self, subdiv: ET._Element) -> str: | ||||
|         """Convert a SUBDIV (subdivision) element to HTML.""" | ||||
|         # Get the title using XPath | ||||
|         title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}") | ||||
|         """Convert a SUBDIV (subdivision) element to HTML, preserving child order.""" | ||||
|         # Get the title using XPath (should be the first TITLE child if present) | ||||
|         title = "" | ||||
|         title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}") | ||||
|         if title_elems: | ||||
|             title_elem = title_elems[0] | ||||
|             # Process TI (title) and STI (subtitle) elements | ||||
| @@ -400,30 +399,26 @@ class FormexArticleConverter: | ||||
|             if sti_list: | ||||
|                 title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>' | ||||
|  | ||||
|         # Process content: either paragraphs, alineas, or nested subdivisions | ||||
|         # Process all children in order, skipping TITLE (already handled) | ||||
|         content = "" | ||||
|  | ||||
|         # Process paragraphs directly under this subdivision | ||||
|         for parag in subdiv.xpath(f"./{self._get_tag('PARAG')}"): | ||||
|             content += self._convert_parag(parag) | ||||
|  | ||||
|         # Process alineas directly under this subdivision | ||||
|         for alinea in subdiv.xpath(f"./{self._get_tag('ALINEA')}"): | ||||
|             content += self._convert_alinea(alinea) | ||||
|  | ||||
|         # Process comments directly under this subdivision | ||||
|         for comment in subdiv.xpath(f"./{self._get_tag('COMMENT')}"): | ||||
|             content += f'<div class="comment">{self._convert_btx(comment)}</div>' | ||||
|  | ||||
|         # Process quotations directly under this subdivision | ||||
|         for quot in subdiv.xpath(f"./{self._get_tag('QUOT.S')}"): | ||||
|             content += ( | ||||
|                 f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>' | ||||
|             ) | ||||
|  | ||||
|         # Process nested subdivisions directly under this subdivision | ||||
|         for sub in subdiv.xpath(f"./{self._get_tag('SUBDIV')}"): | ||||
|             content += self._convert_subdiv(sub) | ||||
|         for child in subdiv.iterchildren(tag="*"): | ||||
|             child_tag = child.tag.replace(self.ns_prefix, "") | ||||
|             if child_tag == "TITLE": | ||||
|                 continue  # already handled | ||||
|             elif child_tag == "PARAG": | ||||
|                 content += self._convert_parag(child) | ||||
|             elif child_tag == "ALINEA": | ||||
|                 content += self._convert_alinea(child) | ||||
|             elif child_tag == "COMMENT": | ||||
|                 content += f'<div class="comment">{self._convert_btx(child)}</div>' | ||||
|             elif child_tag == "QUOT.S": | ||||
|                 content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>' | ||||
|             elif child_tag == "SUBDIV": | ||||
|                 content += self._convert_subdiv(child) | ||||
|             else: | ||||
|                 raise RuntimeError( | ||||
|                     f"Unexpected child element '{child_tag}' in SUBDIV: {text_content(child)}" | ||||
|                 ) | ||||
|  | ||||
|         return f'<section class="subdivision">{title}{content}</section>' | ||||
|  | ||||
| @@ -477,35 +472,25 @@ class FormexArticleConverter: | ||||
|         # Process the content based on what's present | ||||
|         content = "" | ||||
|  | ||||
|         # Check if we have alineas directly under the article | ||||
|         alineas = article.xpath(f"./{self._get_tag('ALINEA')}") | ||||
|         if alineas: | ||||
|             for alinea in alineas: | ||||
|                 content += self._convert_alinea(alinea) | ||||
|  | ||||
|         # Check if we have paragraphs directly under the article | ||||
|         parags = article.xpath(f"./{self._get_tag('PARAG')}") | ||||
|         if parags: | ||||
|             for parag in parags: | ||||
|                 content += self._convert_parag(parag) | ||||
|  | ||||
|         # Check for comments directly under the article | ||||
|         comments = article.xpath(f"./{self._get_tag('COMMENT')}") | ||||
|         if comments: | ||||
|             for comment in comments: | ||||
|                 content += f'<div class="comment">{self._convert_btx(comment)}</div>' | ||||
|  | ||||
|         # Check for quotations directly under the article | ||||
|         quots = article.xpath(f"./{self._get_tag('QUOT.S')}") | ||||
|         if quots: | ||||
|             for quot in quots: | ||||
|                 content += f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>' | ||||
|  | ||||
|         # Check for subdivisions directly under the article | ||||
|         subdivs = article.xpath(f"./{self._get_tag('SUBDIV')}") | ||||
|         if subdivs: | ||||
|             for subdiv in subdivs: | ||||
|                 content += self._convert_subdiv(subdiv) | ||||
|         # Process all child elements (except TI.ART / STI.ART, already handled) in tree order | ||||
|         for child in article.iterchildren(tag="*"): | ||||
|             child_tag = child.tag.replace(self.ns_prefix, "") | ||||
|             if child_tag in ["TI.ART", "STI.ART"]: | ||||
|                 continue  # already handled | ||||
|             elif child_tag == "ALINEA": | ||||
|                 content += self._convert_alinea(child) | ||||
|             elif child_tag == "PARAG": | ||||
|                 content += self._convert_parag(child) | ||||
|             elif child_tag == "COMMENT": | ||||
|                 content += f'<div class="comment">{self._convert_btx(child)}</div>' | ||||
|             elif child_tag == "QUOT.S": | ||||
|                 content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>' | ||||
|             elif child_tag == "SUBDIV": | ||||
|                 content += self._convert_subdiv(child) | ||||
|             else: | ||||
|                 raise RuntimeError( | ||||
|                     f"Unexpected child element '{child_tag}' in ARTICLE: {text_content(child)}" | ||||
|                 ) | ||||
|  | ||||
|         # Assemble the complete article | ||||
|         return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>' | ||||
|   | ||||
		Reference in New Issue
	
	Block a user