Compare commits
	
		
			5 Commits
		
	
	
		
			1d467c827a
			...
			main
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 58bd1160c1 | ||
|  | debaf567ea | ||
|  | 56d271d0df | ||
|  | f0d4214d17 | ||
|  | 56b5e3e3a4 | 
| @@ -12,7 +12,13 @@ describe("CelexSelector", () => { | ||||
|     expect(getByLabelText("Select example:")).toBeInTheDocument(); | ||||
|     expect(getByRole("combobox")).toBeInTheDocument(); | ||||
|  | ||||
|     const options = getAllByRole("option"); | ||||
|     const [def, ...options] = getAllByRole("option"); | ||||
|  | ||||
|     // First option is the disabled placeholder option | ||||
|     expect(def).toHaveValue(""); | ||||
|     expect(def).toHaveTextContent("Select an example"); | ||||
|     expect(def).toBeDisabled(); | ||||
|  | ||||
|     expect(options).toHaveLength(examples.length); | ||||
|     for (const i in examples) { | ||||
|       expect(options[i]).toHaveValue(examples[i].id); | ||||
|   | ||||
| @@ -1,16 +1,15 @@ | ||||
| .toc { | ||||
|   font-size: 0.8rem; | ||||
|   min-width: 25vw; | ||||
|   flex: 1 auto; | ||||
|   flex: 1 0 25vw; | ||||
|  | ||||
|   &.hidden { | ||||
|     flex: 0 0; | ||||
|     min-width: 0; | ||||
|     display: none; | ||||
|   } | ||||
|  | ||||
|   transition: flex-basis 0.1s ease-in-out; | ||||
|  | ||||
|   overflow-y: scroll; | ||||
|   overflow-x: wrap; | ||||
|   height: 100vh; | ||||
|  | ||||
|   .tocDivision { | ||||
|     margin-block: 0.5rem; | ||||
|   | ||||
| @@ -55,15 +55,17 @@ function TOC({ toc }: TOCProps) { | ||||
|   const [isVisible, setIsVisible] = useState(true); | ||||
|  | ||||
|   return ( | ||||
|     <nav className={[styles.toc, isVisible ? "" : styles.hidden].join(" ")}> | ||||
|     <> | ||||
|       <button | ||||
|         onClick={() => setIsVisible(!isVisible)} | ||||
|         className={styles.toggleButton} | ||||
|       > | ||||
|         {isVisible ? "<" : ">"} | ||||
|       </button> | ||||
|       {toc.map((division) => renderDivision(division))} | ||||
|     </nav> | ||||
|       <nav className={[styles.toc, isVisible ? "" : styles.hidden].join(" ")}> | ||||
|         {toc.map((division) => renderDivision(division))} | ||||
|       </nav> | ||||
|     </> | ||||
|   ); | ||||
| } | ||||
| export default TOC; | ||||
|   | ||||
| @@ -1,5 +1,9 @@ | ||||
| export const examples = [ | ||||
|   { name: "GDPR", id: "32016R0679" }, | ||||
|   { name: "AI Act", id: "32024R1689" }, | ||||
|   { name: "Cybersecurity Act", id: "32019R0881" }, | ||||
|   { name: "Cyber Resilience Act", id: "32024R2847" }, | ||||
|   { name: "Medical Device Regulation", id: "32017R0745" }, | ||||
|   { name: "NIS 2 Directive", id: "32022L2555" }, | ||||
|   { name: "Digital Services Act", id: "32022R2065" }, | ||||
| ]; | ||||
|   | ||||
| @@ -20,3 +20,8 @@ formex-viewer = "formex_viewer:main" | ||||
| [build-system] | ||||
| requires = ["hatchling"] | ||||
| build-backend = "hatchling.build" | ||||
|  | ||||
| [dependency-groups] | ||||
| dev = [ | ||||
|     "pytest>=8.3.5", | ||||
| ] | ||||
|   | ||||
| @@ -2,7 +2,7 @@ import html | ||||
| import re | ||||
| import warnings | ||||
| from dataclasses import dataclass | ||||
| from typing import Literal, Optional, Union | ||||
| from typing import Literal, Optional, Union, cast | ||||
|  | ||||
| import lxml.etree | ||||
| from lxml import etree as ET | ||||
| @@ -10,7 +10,7 @@ from lxml import etree as ET | ||||
| from formex_viewer.main import Language | ||||
|  | ||||
|  | ||||
| def text_content(el: lxml.etree.Element) -> str: | ||||
| def text_content(el: ET._Element) -> str: | ||||
|     """Get the text content of an XML element, including all child elements.""" | ||||
|  | ||||
|     def _iterate(el): | ||||
| @@ -35,7 +35,7 @@ class CrossReference: | ||||
|     paragraph: int | None = None | ||||
|  | ||||
|  | ||||
| def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]: | ||||
| def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]: | ||||
|     """Extract cross-references from an XML element. | ||||
|  | ||||
|     Args: | ||||
| @@ -80,19 +80,23 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer | ||||
|                 match.group("art_num") if key == "article" else match.group("annex_num") | ||||
|             ) | ||||
|             parag_num = match.groupdict().get("parag_num") | ||||
|  | ||||
|             if key not in ["article", "annex"]: | ||||
|                 raise RuntimeError() | ||||
|  | ||||
|             crossref_text = match.group(0) | ||||
|             crossrefs.append( | ||||
|                 CrossReference( | ||||
|                     target=key, | ||||
|                     id=crossref_id, | ||||
|                     paragraph=parag_num, | ||||
|                     paragraph=int(parag_num) if parag_num else None, | ||||
|                     text=crossref_text, | ||||
|                 ) | ||||
|             ) | ||||
|     return crossrefs | ||||
|  | ||||
|  | ||||
| def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None: | ||||
| def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None: | ||||
|     """Extract a specific article from a Formex document. | ||||
|  | ||||
|     Args: | ||||
| @@ -109,8 +113,8 @@ def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | No | ||||
|  | ||||
|  | ||||
| def extract_paragraph( | ||||
|     doc: ET.ElementBase, article_id: int, paragraph_id: int | ||||
| ) -> ET.ElementBase | None: | ||||
|     doc: ET._Element, article_id: int, paragraph_id: int | ||||
| ) -> ET._Element | None: | ||||
|     """Extract a specific paragraph from an article in a Formex document. | ||||
|  | ||||
|     Args: | ||||
| @@ -146,7 +150,7 @@ class FormexArticleConverter: | ||||
|         """Get the tag name with namespace if available.""" | ||||
|         return f"{self.ns_prefix}{tag}" | ||||
|  | ||||
|     def _get_text(self, element: ET.Element) -> str: | ||||
|     def _get_text(self, element: ET._Element) -> str: | ||||
|         """Get the text content of an element, including all nested text. | ||||
|  | ||||
|         This uses lxml's text_content() method when available, falling back to | ||||
| @@ -161,7 +165,7 @@ class FormexArticleConverter: | ||||
|         except AttributeError: | ||||
|             # Fall back to manual traversal if text_content() is not available | ||||
|             text = element.text or "" | ||||
|             for child in element: | ||||
|             for child in element.iterchildren(tag="*"): | ||||
|                 text += self._get_text(child) | ||||
|                 if child.tail: | ||||
|                     text += child.tail | ||||
| @@ -182,7 +186,7 @@ class FormexArticleConverter: | ||||
|         ) | ||||
|         return text | ||||
|  | ||||
|     def _convert_btx(self, element: ET.Element) -> str: | ||||
|     def _convert_btx(self, element: ET._Element) -> str: | ||||
|         """ | ||||
|         Convert basic text elements (t_btx, t_btx.seq) to HTML. | ||||
|  | ||||
| @@ -202,7 +206,7 @@ class FormexArticleConverter: | ||||
|                 # Replace the cross-reference text with a link | ||||
|                 result = self._replace_xref(result, xref) | ||||
|  | ||||
|         for child in element: | ||||
|         for child in element.iterchildren(tag="*"): | ||||
|             child_tag = child.tag.replace(self.ns_prefix, "") | ||||
|  | ||||
|             # Process common inline elements | ||||
| @@ -309,7 +313,7 @@ class FormexArticleConverter: | ||||
|  | ||||
|         return result | ||||
|  | ||||
|     def _convert_list(self, list_element: ET.Element) -> str: | ||||
|     def _convert_list(self, list_element: ET._Element) -> str: | ||||
|         """Convert a Formex LIST element to HTML list items.""" | ||||
|         result = "" | ||||
|         # Using lxml's xpath to get direct child ITEM elements | ||||
| @@ -347,41 +351,40 @@ class FormexArticleConverter: | ||||
|  | ||||
|         return result | ||||
|  | ||||
|     def _convert_alinea(self, alinea: ET.Element) -> str: | ||||
|     def _convert_alinea(self, alinea: ET._Element) -> str: | ||||
|         """Convert an ALINEA element to HTML.""" | ||||
|         return f'<p class="alinea">{self._convert_btx(alinea)}</p>' | ||||
|  | ||||
|     def _convert_parag(self, parag: ET.Element) -> str: | ||||
|     def _convert_parag(self, parag: ET._Element) -> str: | ||||
|         """Convert a PARAG (paragraph) element to HTML.""" | ||||
|         identifier = parag.get("IDENTIFIER", "") | ||||
|         parag_id = self._create_id(identifier) if identifier else "" | ||||
|  | ||||
|         # Get the paragraph number using XPath | ||||
|         no_parag_elems = parag.xpath(f"./{self._get_tag('NO.PARAG')}") | ||||
|         parag_num = self._get_text(no_parag_elems[0]) if no_parag_elems else "" | ||||
|  | ||||
|         # Process the paragraph's child elements in tree order | ||||
|         content = "" | ||||
|         for alinea in parag.xpath(f"./{self._get_tag('ALINEA')}"): | ||||
|             content += self._convert_alinea(alinea) | ||||
|         for child in parag.iterchildren(tag="*"): | ||||
|             child_tag = child.tag.replace(self.ns_prefix, "") | ||||
|             if child_tag == "ALINEA": | ||||
|                 content += self._convert_alinea(child) | ||||
|             elif child_tag == "COMMENT": | ||||
|                 content += f'<div class="comment">{self._convert_btx(child)}</div>' | ||||
|             elif child_tag == "QUOT.S": | ||||
|                 content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>' | ||||
|             elif child_tag == "NO.PARAG": | ||||
|                 content += ( | ||||
|                     f'<span class="paragraph-number">{self._convert_btx(child)}</span>' | ||||
|                 ) | ||||
|             else: | ||||
|                 raise RuntimeError( | ||||
|                     f"Unexpected child element '{child_tag}' in PARAG: {text_content(child)}" | ||||
|                 ) | ||||
|  | ||||
|         # Process any comments | ||||
|         for comment in parag.xpath(f"./{self._get_tag('COMMENT')}"): | ||||
|             content += f'<div class="comment">{self._convert_btx(comment)}</div>' | ||||
|         return f'<div class="paragraph" data-paragraph-id="{parag_id}">{content}</div>' | ||||
|  | ||||
|         # Process any quotations | ||||
|         for quot in parag.xpath(f"./{self._get_tag('QUOT.S')}"): | ||||
|             content += ( | ||||
|                 f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>' | ||||
|             ) | ||||
|  | ||||
|         return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>' | ||||
|  | ||||
|     def _convert_subdiv(self, subdiv: ET.Element) -> str: | ||||
|         """Convert a SUBDIV (subdivision) element to HTML.""" | ||||
|         # Get the title using XPath | ||||
|         title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}") | ||||
|     def _convert_subdiv(self, subdiv: ET._Element) -> str: | ||||
|         """Convert a SUBDIV (subdivision) element to HTML, preserving child order.""" | ||||
|         # Get the title using XPath (should be the first TITLE child if present) | ||||
|         title = "" | ||||
|         title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}") | ||||
|         if title_elems: | ||||
|             title_elem = title_elems[0] | ||||
|             # Process TI (title) and STI (subtitle) elements | ||||
| @@ -396,34 +399,30 @@ class FormexArticleConverter: | ||||
|             if sti_list: | ||||
|                 title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>' | ||||
|  | ||||
|         # Process content: either paragraphs, alineas, or nested subdivisions | ||||
|         # Process all children in order, skipping TITLE (already handled) | ||||
|         content = "" | ||||
|  | ||||
|         # Process paragraphs directly under this subdivision | ||||
|         for parag in subdiv.xpath(f"./{self._get_tag('PARAG')}"): | ||||
|             content += self._convert_parag(parag) | ||||
|  | ||||
|         # Process alineas directly under this subdivision | ||||
|         for alinea in subdiv.xpath(f"./{self._get_tag('ALINEA')}"): | ||||
|             content += self._convert_alinea(alinea) | ||||
|  | ||||
|         # Process comments directly under this subdivision | ||||
|         for comment in subdiv.xpath(f"./{self._get_tag('COMMENT')}"): | ||||
|             content += f'<div class="comment">{self._convert_btx(comment)}</div>' | ||||
|  | ||||
|         # Process quotations directly under this subdivision | ||||
|         for quot in subdiv.xpath(f"./{self._get_tag('QUOT.S')}"): | ||||
|             content += ( | ||||
|                 f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>' | ||||
|             ) | ||||
|  | ||||
|         # Process nested subdivisions directly under this subdivision | ||||
|         for sub in subdiv.xpath(f"./{self._get_tag('SUBDIV')}"): | ||||
|             content += self._convert_subdiv(sub) | ||||
|         for child in subdiv.iterchildren(tag="*"): | ||||
|             child_tag = child.tag.replace(self.ns_prefix, "") | ||||
|             if child_tag == "TITLE": | ||||
|                 continue  # already handled | ||||
|             elif child_tag == "PARAG": | ||||
|                 content += self._convert_parag(child) | ||||
|             elif child_tag == "ALINEA": | ||||
|                 content += self._convert_alinea(child) | ||||
|             elif child_tag == "COMMENT": | ||||
|                 content += f'<div class="comment">{self._convert_btx(child)}</div>' | ||||
|             elif child_tag == "QUOT.S": | ||||
|                 content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>' | ||||
|             elif child_tag == "SUBDIV": | ||||
|                 content += self._convert_subdiv(child) | ||||
|             else: | ||||
|                 raise RuntimeError( | ||||
|                     f"Unexpected child element '{child_tag}' in SUBDIV: {text_content(child)}" | ||||
|                 ) | ||||
|  | ||||
|         return f'<section class="subdivision">{title}{content}</section>' | ||||
|  | ||||
|     def convert_article(self, article: Union[str, ET.Element]) -> str: | ||||
|     def convert_article(self, article: Union[str, ET._Element]) -> str: | ||||
|         """ | ||||
|         Convert a Formex <ARTICLE> element to HTML5. | ||||
|  | ||||
| @@ -437,7 +436,9 @@ class FormexArticleConverter: | ||||
|         if isinstance(article, str): | ||||
|             try: | ||||
|                 parser = ET.XMLParser(remove_blank_text=True) | ||||
|                 article = ET.fromstring(article.encode("utf-8"), parser) | ||||
|                 article = cast( | ||||
|                     ET._Element, ET.fromstring(article.encode("utf-8"), parser) | ||||
|                 ) | ||||
|             except ET.XMLSyntaxError as e: | ||||
|                 return f"<p>Error parsing XML: {e}</p>" | ||||
|  | ||||
| @@ -471,35 +472,25 @@ class FormexArticleConverter: | ||||
|         # Process the content based on what's present | ||||
|         content = "" | ||||
|  | ||||
|         # Check if we have alineas directly under the article | ||||
|         alineas = article.xpath(f"./{self._get_tag('ALINEA')}") | ||||
|         if alineas: | ||||
|             for alinea in alineas: | ||||
|                 content += self._convert_alinea(alinea) | ||||
|  | ||||
|         # Check if we have paragraphs directly under the article | ||||
|         parags = article.xpath(f"./{self._get_tag('PARAG')}") | ||||
|         if parags: | ||||
|             for parag in parags: | ||||
|                 content += self._convert_parag(parag) | ||||
|  | ||||
|         # Check for comments directly under the article | ||||
|         comments = article.xpath(f"./{self._get_tag('COMMENT')}") | ||||
|         if comments: | ||||
|             for comment in comments: | ||||
|                 content += f'<div class="comment">{self._convert_btx(comment)}</div>' | ||||
|  | ||||
|         # Check for quotations directly under the article | ||||
|         quots = article.xpath(f"./{self._get_tag('QUOT.S')}") | ||||
|         if quots: | ||||
|             for quot in quots: | ||||
|                 content += f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>' | ||||
|  | ||||
|         # Check for subdivisions directly under the article | ||||
|         subdivs = article.xpath(f"./{self._get_tag('SUBDIV')}") | ||||
|         if subdivs: | ||||
|             for subdiv in subdivs: | ||||
|                 content += self._convert_subdiv(subdiv) | ||||
|         # Process all child elements (except TI.ART and STI.ART) in tree order | ||||
|         for child in article.iterchildren(tag="*"): | ||||
|             child_tag = child.tag.replace(self.ns_prefix, "") | ||||
|             if child_tag in ["TI.ART", "STI.ART"]: | ||||
|                 continue  # already handled | ||||
|             elif child_tag == "ALINEA": | ||||
|                 content += self._convert_alinea(child) | ||||
|             elif child_tag == "PARAG": | ||||
|                 content += self._convert_parag(child) | ||||
|             elif child_tag == "COMMENT": | ||||
|                 content += f'<div class="comment">{self._convert_btx(child)}</div>' | ||||
|             elif child_tag == "QUOT.S": | ||||
|                 content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>' | ||||
|             elif child_tag == "SUBDIV": | ||||
|                 content += self._convert_subdiv(child) | ||||
|             else: | ||||
|                 raise RuntimeError( | ||||
|                     f"Unexpected child element '{child_tag}' in ARTICLE: {text_content(child)}" | ||||
|                 ) | ||||
|  | ||||
|         # Assemble the complete article | ||||
|         return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>' | ||||
|   | ||||
| @@ -160,9 +160,14 @@ def paragraph( | ||||
|     """ | ||||
|     xml = _get_fmx4_data(celex_id, language) | ||||
|     parag = extract_paragraph(xml, article_id=article_id, paragraph_id=parag_id) | ||||
|     if parag is None: | ||||
|         return Response( | ||||
|             "Paragraph not found", | ||||
|             status_code=404, | ||||
|         ) | ||||
|  | ||||
|     return Response( | ||||
|         FormexArticleConverter(language=language).convert_article(parag), | ||||
|         FormexArticleConverter(language=language)._convert_parag(parag), | ||||
|         media_type="text/html", | ||||
|     ) | ||||
|  | ||||
|   | ||||
							
								
								
									
										52
									
								
								tests/test_parser.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								tests/test_parser.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,52 @@ | ||||
| import pytest | ||||
| from lxml import etree as ET | ||||
|  | ||||
| from formex_viewer.formex4 import FormexArticleConverter | ||||
| from formex_viewer.main import Language | ||||
|  | ||||
|  | ||||
| @pytest.fixture | ||||
| def converter(): | ||||
|     return FormexArticleConverter(language=Language.ENG) | ||||
|  | ||||
|  | ||||
| def test_convert_tree_order(converter): | ||||
|     """Test that the order of HTML blocks in the converted article matches the order of elements in the XML tree.""" | ||||
|  | ||||
|     xml = """ | ||||
|     <ARTICLE> | ||||
|         <SUBDIV> | ||||
|             <TITLE> | ||||
|                 <TI>Subdivision Title</TI> | ||||
|                 <STI>Subdivision Subtitle</STI> | ||||
|             </TITLE> | ||||
|             <PARAG IDENTIFIER="001.001"> | ||||
|                 <NO.PARAG>1</NO.PARAG> | ||||
|                 <ALINEA>Paragraph 1 text.</ALINEA> | ||||
|             </PARAG> | ||||
|             <COMMENT>Comment text.</COMMENT> | ||||
|             <ALINEA>Alinea text.</ALINEA> | ||||
|             <QUOT.S>Quotation text.</QUOT.S> | ||||
|             <SUBDIV> | ||||
|                 <TITLE> | ||||
|                     <TI>Nested Subdivision</TI> | ||||
|                 </TITLE> | ||||
|                 <ALINEA>Nested alinea.</ALINEA> | ||||
|             </SUBDIV> | ||||
|         </SUBDIV> | ||||
|     </ARTICLE> | ||||
|     """ | ||||
|     parser = ET.XMLParser(remove_blank_text=True) | ||||
|     el = ET.fromstring(xml, parser) | ||||
|     html = converter.convert_article(el) | ||||
|  | ||||
|     # Check that the order of HTML blocks matches the order of elements in the XML tree | ||||
|     idx_title = html.index("Subdivision Title") | ||||
|     idx_parag = html.index('class="paragraph"') | ||||
|     idx_comment = html.index("Comment text.") | ||||
|     idx_alinea = html.index("Alinea text.") | ||||
|     idx_quot = html.index("Quotation text.") | ||||
|     idx_nested = html.index("Nested Subdivision") | ||||
|  | ||||
|     # The order in the XML: title, parag, comment, alinea, quot, nested subdiv | ||||
|     assert idx_title < idx_parag < idx_comment < idx_alinea < idx_quot < idx_nested | ||||
		Reference in New Issue
	
	Block a user