Working initial version
This commit is contained in:
		
							
								
								
									
										367
									
								
								src/formex_viewer/formex4.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										367
									
								
								src/formex_viewer/formex4.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,367 @@ | ||||
| import html | ||||
| import re | ||||
| from typing import Optional, Union | ||||
|  | ||||
| import lxml.etree | ||||
| from lxml import etree as ET | ||||
|  | ||||
|  | ||||
def text_content(el: "lxml.etree._Element") -> str:
    """Return the concatenated text of *el* and its descendants, in document order.

    Only text that lies inside *el* is returned: the element's own tail
    (the text following its closing tag) belongs to the parent and is not
    included. Comments, processing instructions and entity nodes contribute
    no text of their own, but the text that follows them is kept.

    Args:
        el: The element whose text should be collected.

    Returns:
        The joined text, or an empty string when the element holds none.
    """

    def _iterate(node):
        if node.text:
            yield node.text
        for child in node:
            # lxml represents comments/PIs/entities as nodes whose ``tag`` is
            # a factory function rather than a string; skip their content.
            if isinstance(child.tag, str):
                yield from _iterate(child)
            # A child's tail sits after the child's whole subtree, so it must
            # be emitted only once that subtree has been walked.
            if child.tail:
                yield child.tail

    return "".join(_iterate(el))
|  | ||||
|  | ||||
class FormexArticleConverter:
    """Converts Formex XML <ARTICLE> elements to semantic HTML5.

    Element lookups use ``find``/``findall`` (ElementPath), which accept
    Clark-notation names such as ``{namespace}TAG``. Text and attribute
    values taken from the XML are HTML-escaped before being written out.

    NOTE(review): ALINEA is rendered as ``<p class="alinea">`` while nested
    P and LIST children are rendered as ``<p>`` / ``<ul>`` / ``<ol>``, so
    block elements can end up nested inside a ``<p>``. The markup is kept
    as-is because stylesheets may target ``p.alinea``.
    """

    # HT TYPE attribute -> (opening markup, closing markup).
    _HT_WRAPPERS = {
        "BOLD": ("<strong>", "</strong>"),
        "ITALIC": ("<em>", "</em>"),
        "SUB": ("<sub>", "</sub>"),
        "SUP": ("<sup>", "</sup>"),
        "UNDERLINE": ("<u>", "</u>"),
        "SC": ('<span style="font-variant: small-caps">', "</span>"),
    }

    # LIST TYPE attribute -> suffix of the ``list-…`` CSS class.
    # BULLET is absent on purpose: it renders as a plain <ul> without class.
    _LIST_CLASS_SUFFIX = {
        "ARAB": "decimal",
        "ALPHA": "upper-alpha",
        "alpha": "lower-alpha",
        "ROMAN": "upper-roman",
        "roman": "lower-roman",
        "DASH": "dash",
        "NDASH": "ndash",
        "NONE": "none",
        "OTHER": "none",
    }

    # LIST types rendered as <ol>; every other type becomes <ul>.
    _ORDERED_LIST_TYPES = frozenset({"ARAB", "ALPHA", "alpha", "ROMAN", "roman"})

    # Block-level children rendered inside PARAG and inside ARTICLE/SUBDIV.
    _PARAG_BLOCKS = frozenset({"ALINEA", "COMMENT", "QUOT.S"})
    _CONTAINER_BLOCKS = frozenset({"ALINEA", "PARAG", "COMMENT", "QUOT.S", "SUBDIV"})

    def __init__(self, namespace: Optional[str] = None):
        """
        Initialize the converter.

        Args:
            namespace: Optional XML namespace to use when parsing elements
        """
        self.ns = namespace
        self.ns_prefix = f"{{{namespace}}}" if namespace else ""

    def _get_tag(self, tag: str) -> str:
        """Get the tag name with namespace if available."""
        return f"{self.ns_prefix}{tag}"

    def _local_name(self, node: ET.Element) -> Optional[str]:
        """Return the tag of *node* without the configured namespace prefix.

        Returns ``None`` for nodes whose ``tag`` is not a string (comments,
        processing instructions, entities), so callers can skip them.
        """
        tag = node.tag
        if not isinstance(tag, str):
            return None
        if self.ns_prefix and tag.startswith(self.ns_prefix):
            return tag[len(self.ns_prefix):]
        return tag

    def _get_text(self, element: ET.Element) -> str:
        """Get the plain (unescaped) text of an element, including nested text.

        Uses the element's own ``text_content()`` method when it has one;
        otherwise walks the subtree manually, skipping the content of
        comments and processing instructions but keeping their tail text.

        Args:
            element: The element to read, or ``None``.

        Returns:
            The concatenated text; an empty string when *element* is ``None``.
        """
        if element is None:
            return ""

        native = getattr(element, "text_content", None)
        if callable(native):
            return native()

        parts = [element.text or ""]
        for child in element:
            if isinstance(child.tag, str):
                parts.append(self._get_text(child))
            if child.tail:
                parts.append(child.tail)
        return "".join(parts)

    def _create_id(self, identifier: str) -> str:
        """Create a valid HTML ID from the article identifier."""
        # Anything other than ASCII letters, digits and "-" becomes "-".
        clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
        return f"art-{clean_id}"

    def _convert_btx(self, element: ET.Element) -> str:
        """
        Convert basic text elements (t_btx, t_btx.seq) to HTML.

        Renders the element's leading text, then each child followed by its
        tail text. This is a simplified implementation: child elements
        without a dedicated rule are rendered as their converted content
        with no wrapper.

        Args:
            element: The element whose mixed content is converted, or ``None``.

        Returns:
            An HTML fragment; an empty string when *element* is ``None``.
        """
        if element is None:
            return ""

        # quote=False: in text context only &, < and > need escaping, and
        # quotes/apostrophes stay readable.
        parts = [html.escape(element.text or "", quote=False)]
        for child in element:
            tag = self._local_name(child)
            if tag is not None:
                parts.append(self._convert_inline(child, tag))
            # Tail text is kept even for skipped comment/PI nodes.
            if child.tail:
                parts.append(html.escape(child.tail, quote=False))
        return "".join(parts)

    def _convert_inline(self, child: ET.Element, tag: str) -> str:
        """Render one child element of mixed content; *tag* is its local name."""
        if tag == "HT":
            # Highlighted text; unknown or NORMAL types get no wrapper.
            opening, closing = self._HT_WRAPPERS.get(
                child.get("TYPE", "NORMAL"), ("", "")
            )
            return f"{opening}{self._convert_btx(child)}{closing}"
        if tag == "FT":
            # Formatted text (numbers, codes, etc.)
            ft_type = child.get("TYPE", "")
            inner = self._convert_btx(child)
            if ft_type in ("NUMBER", "DECIMAL"):
                return f'<span class="ft-number">{inner}</span>'
            if ft_type == "CODE":
                return f"<code>{inner}</code>"
            return f'<span class="ft-{html.escape(ft_type.lower())}">{inner}</span>'
        if tag == "IE":
            # Inclusion/exclusion marker
            return '<span class="ie-marker">±</span>'
        if tag == "BR":
            return "<br>"
        if tag == "P":
            return f"<p>{self._convert_btx(child)}</p>"
        if tag == "NOTE":
            note_id = html.escape(child.get("NOTE.ID", ""))
            return f'<sup class="note-ref" id="{note_id}">{self._convert_btx(child)}</sup>'
        if tag == "QUOT.START":
            return "“"
        if tag == "QUOT.END":
            return "”"
        if tag == "LIST":
            return self._convert_list_element(child)
        if tag == "TXT":
            # Simple text element: markup inside it is flattened to text.
            return html.escape(self._get_text(child))
        if tag == "LINK":
            uri = html.escape(child.get("URI", "#"))
            return f'<a href="{uri}">{self._convert_btx(child)}</a>'
        if tag == "REF.DOC.OJ":
            # Reference to an OJ document, built from its attributes only.
            coll, no_oj, date, page = (
                html.escape(child.get(name, ""), quote=False)
                for name in ("COLL", "NO.OJ", "DATE.PUB", "PAGE.FIRST")
            )
            return f'<span class="ref-oj">{coll} {no_oj}, {date}, p. {page}</span>'
        # Any other element: render its content without a wrapper.
        return self._convert_btx(child)

    def _convert_list_element(self, list_element: ET.Element) -> str:
        """Render a LIST element including its <ul>/<ol> wrapper.

        A missing TYPE is treated as BULLET; an unrecognised TYPE is
        rendered as ``<ul class='list-none'>``.
        """
        list_type = list_element.get("TYPE", "BULLET")
        items = self._convert_list(list_element)
        if list_type == "BULLET":
            return f"<ul>{items}</ul>"
        suffix = self._LIST_CLASS_SUFFIX.get(list_type, "none")
        wrapper = "ol" if list_type in self._ORDERED_LIST_TYPES else "ul"
        return f"<{wrapper} class='list-{suffix}'>{items}</{wrapper}>"

    def _convert_list(self, list_element: ET.Element) -> str:
        """Convert a Formex LIST element to HTML list items.

        Only direct ITEM children are rendered, one ``<li>`` each. Text
        placed directly inside LIST or ITEM (outside child elements) is
        ignored.

        Args:
            list_element: The LIST element.

        Returns:
            The concatenated ``<li>…</li>`` markup, without a list wrapper.
        """
        items = []
        for item in list_element.findall(self._get_tag("ITEM")):
            parts = []
            for child in item:
                tag = self._local_name(child)
                if tag is None:
                    continue
                if tag == "NP":
                    parts.append(self._convert_np(child))
                else:
                    # P and anything else: content only, no wrapper.
                    parts.append(self._convert_btx(child))
            items.append(f"<li>{''.join(parts)}</li>")
        return "".join(items)

    def _convert_np(self, np_element: ET.Element) -> str:
        """Render a numbered paragraph (NP) found inside a list item.

        The first NO.P becomes ``<span class="item-number">`` and the first
        TXT follows it as plain text, separated by one space. Either part is
        rendered on its own when the other is missing. Remaining child
        elements (for example nested P/LIST blocks) are rendered after them.
        """
        no_p = np_element.find(self._get_tag("NO.P"))
        txt = np_element.find(self._get_tag("TXT"))

        lead = []
        if no_p is not None:
            number = html.escape(self._get_text(no_p), quote=False)
            lead.append(f'<span class="item-number">{number}</span>')
        if txt is not None:
            lead.append(html.escape(self._get_text(txt), quote=False))

        parts = [" ".join(lead)]
        for child in np_element:
            if child is no_p or child is txt:
                continue
            tag = self._local_name(child)
            if tag is not None:
                parts.append(self._convert_inline(child, tag))
        return "".join(parts)

    def _convert_blocks(self, parent: ET.Element, allowed: frozenset) -> str:
        """Render the block-level children of *parent* in document order.

        Args:
            parent: The element whose direct children are rendered.
            allowed: Local tag names to render; other children are skipped.

        Returns:
            The concatenated HTML of the rendered children.
        """
        parts = []
        for child in parent:
            tag = self._local_name(child)
            if tag not in allowed:
                continue
            if tag == "ALINEA":
                parts.append(self._convert_alinea(child))
            elif tag == "PARAG":
                parts.append(self._convert_parag(child))
            elif tag == "COMMENT":
                parts.append(f'<div class="comment">{self._convert_btx(child)}</div>')
            elif tag == "QUOT.S":
                parts.append(
                    f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
                )
            elif tag == "SUBDIV":
                parts.append(self._convert_subdiv(child))
        return "".join(parts)

    def _convert_alinea(self, alinea: ET.Element) -> str:
        """Convert an ALINEA element to HTML."""
        return f'<p class="alinea">{self._convert_btx(alinea)}</p>'

    def _convert_parag(self, parag: ET.Element) -> str:
        """Convert a PARAG (paragraph) element to HTML.

        The IDENTIFIER attribute, when present, is turned into the
        ``data-paragraph-id`` value; the first NO.PARAG child supplies the
        paragraph number. ALINEA, COMMENT and QUOT.S children are rendered
        in document order.
        """
        identifier = parag.get("IDENTIFIER", "")
        parag_id = self._create_id(identifier) if identifier else ""

        no_parag = parag.find(self._get_tag("NO.PARAG"))
        parag_num = html.escape(self._get_text(no_parag), quote=False)

        content = self._convert_blocks(parag, self._PARAG_BLOCKS)

        return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'

    def _convert_subdiv(self, subdiv: ET.Element) -> str:
        """Convert a SUBDIV (subdivision) element to HTML.

        The first TITLE child yields an ``<h4>`` from its first TI and, when
        STI children exist, an ``<h5>`` joining them with spaces. PARAG,
        ALINEA, COMMENT, QUOT.S and nested SUBDIV children are rendered in
        document order.
        """
        title = ""
        title_elem = subdiv.find(self._get_tag("TITLE"))
        if title_elem is not None:
            ti_text = self._convert_btx(title_elem.find(self._get_tag("TI")))
            title = f'<h4 class="subdivision-title">{ti_text}</h4>'

            sti_list = [
                self._convert_btx(sti)
                for sti in title_elem.findall(self._get_tag("STI"))
            ]
            if sti_list:
                title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>'

        content = self._convert_blocks(subdiv, self._CONTAINER_BLOCKS)

        return f'<section class="subdivision">{title}{content}</section>'

    def convert_article(self, article: Union[str, ET.Element]) -> str:
        """
        Convert a Formex <ARTICLE> element to HTML5.

        The element passed in is not modified. Comments and processing
        instructions inside it are ignored during rendering.

        Args:
            article: Either an lxml Element or an XML string representing an ARTICLE

        Returns:
            A string containing the HTML5 representation of the article.
            When *article* is a string that cannot be parsed, a ``<p>``
            holding the parser's error message is returned instead.
        """
        # Parse the article if it's a string
        if isinstance(article, str):
            try:
                parser = ET.XMLParser(remove_blank_text=True)
                article = ET.fromstring(article.encode("utf-8"), parser)
            except ET.XMLSyntaxError as e:
                # The message can quote fragments of the input, so escape it.
                return f"<p>Error parsing XML: {html.escape(str(e), quote=False)}</p>"

        identifier = article.get("IDENTIFIER", "")
        article_id = self._create_id(identifier)

        # Title and subtitle: first matching descendant, if any.
        article_title = self._convert_btx(
            article.find(f".//{self._get_tag('TI.ART')}")
        )
        article_subtitle = self._convert_btx(
            article.find(f".//{self._get_tag('STI.ART')}")
        )

        header = f'<header><h3 class="article-title">{article_title}</h3>'
        if article_subtitle:
            header += f'<h4 class="article-subtitle">{article_subtitle}</h4>'
        header += "</header>"

        content = self._convert_blocks(article, self._CONTAINER_BLOCKS)

        return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>'
		Reference in New Issue
	
	Block a user