elements to semantic HTML5."""
def __init__(self, language: Language, namespace: Optional[str] = None):
"""
Initialize the converter.
Args:
language: Language object to determine the language for cross-reference extraction
namespace: Optional XML namespace to use when parsing elements
"""
self.ns = namespace
self.language = language
self.ns_prefix = f"{{{namespace}}}" if namespace else ""
def _get_tag(self, tag: str) -> str:
"""Get the tag name with namespace if available."""
return f"{self.ns_prefix}{tag}"
def _get_text(self, element: ET._Element) -> str:
"""Get the text content of an element, including all nested text.
This uses lxml's text_content() method when available, falling back to
manual traversal for other cases.
"""
if element is None:
return ""
# Use lxml's built-in text_content() method which is more efficient
try:
return element.text_content()
except AttributeError:
# Fall back to manual traversal if text_content() is not available
text = element.text or ""
for child in element.iterchildren(tag="*"):
text += self._get_text(child)
if child.tail:
text += child.tail
return text
def _create_id(self, identifier: str) -> str:
"""Create a valid HTML ID from the article identifier."""
# Clean and normalize the identifier for use as an HTML id
clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
return f"art-{clean_id}"
def _replace_xref(self, text: str, xref: CrossReference) -> str:
"""Replace a cross-reference instance with semantic markup in the text."""
# Replace the cross-reference text with a link
text = text.replace(
xref.text,
f'{xref.text}',
)
return text
def _convert_btx(self, element: ET._Element) -> str:
"""
Convert basic text elements (t_btx, t_btx.seq) to HTML.
This is a simplified implementation. In a complete version,
this would handle all the possible child elements defined in t_btx.
"""
# Overview: start from the element's leading text, then append the rendering
# of each child element followed by that child's tail text, and return the
# accumulated string. Returns "" when element is None.
# NOTE(review): several string literals in this method are split across two
# physical lines in this view, and many branches append identical text, so
# markup may be missing from the literals — confirm against the original file.
if element is None:
return ""
result = element.text or ""
# NOTE(review): element.tag is compared without removing self.ns_prefix,
# unlike the child tags below — confirm titles are still recognised when a
# namespace is configured.
is_title = element.tag in ("TI", "STI", "TI.ART", "STI.ART")
if not is_title and not element.getchildren():
# Cross-references should be treated at the deepest level
xrefs = extract_xrefs(element, self.language)
for xref in xrefs:
# Replace the cross-reference text with a link
result = self._replace_xref(result, xref)
# Only element children are visited (tag="*"); each is dispatched on its tag
# name with the namespace prefix removed.
for child in element.iterchildren(tag="*"):
child_tag = child.tag.replace(self.ns_prefix, "")
# Process common inline elements
if child_tag == "HT":
# Handle highlighted text with appropriate HTML tags
# A missing TYPE attribute is treated as "NORMAL", which falls to the else branch.
ht_type = child.get("TYPE", "NORMAL")
if ht_type == "BOLD":
result += f"{self._convert_btx(child)}"
elif ht_type == "ITALIC":
result += f"{self._convert_btx(child)}"
elif ht_type == "SUB":
result += f"{self._convert_btx(child)}"
elif ht_type == "SUP":
result += f"{self._convert_btx(child)}"
elif ht_type == "UNDERLINE":
result += f"{self._convert_btx(child)}"
elif ht_type == "SC": # Small caps
result += f'{self._convert_btx(child)}'
else:
result += self._convert_btx(child)
elif child_tag == "FT":
# Format text (numbers, codes, etc.)
ft_type = child.get("TYPE", "")
if ft_type == "NUMBER" or ft_type == "DECIMAL":
result += (
f'{self._convert_btx(child)}'
)
elif ft_type == "CODE":
result += f"{self._convert_btx(child)}"
else:
result += f'{self._convert_btx(child)}'
elif child_tag == "IE":
# Inclusion/exclusion marker
result += '±'
elif child_tag == "BR":
# Line break
result += "
"
elif child_tag == "P":
# Paragraph
result += f"{self._convert_btx(child)}
"
elif child_tag == "NOTE":
# Note reference
# NOTE(review): note_id is read but not referenced in the visible literal below.
note_id = child.get("NOTE.ID", "")
result += f'{self._convert_btx(child)}'
elif child_tag == "QUOT.START":
# Opening quotation mark
result += "“"
elif child_tag == "QUOT.END":
# Closing quotation mark
result += "”"
elif child_tag == "LIST":
# Handle lists
# Formex styles to CSS list-style-type mapping
# NOTE(review): the key "NDASH:" ends with a colon, so TYPE="NDASH" is not
# found in this map — looks like a typo; confirm.
list_style_map = {
"ARAB": "decimal",
"ALPHA": "upper-alpha",
"alpha": "lower-alpha",
"ROMAN": "upper-roman",
"roman": "lower-roman",
"BULLET": "disc",
"DASH": "'—'",
"NDASH:": "'–'",
"NONE": "none",
"OTHER": "none",
}
# A missing TYPE attribute defaults to "BULLET".
list_type = child.get("TYPE", "BULLET")
# Plain indexing: any TYPE value that is not a key of the map raises KeyError.
# NOTE(review): list_style_type is not referenced again in the visible code.
list_style_type = list_style_map[list_type]
if list_type == "BULLET":
result += f"{self._convert_list(child)}
"
elif list_type in ["ARAB", "ALPHA", "alpha", "ROMAN", "roman"]:
result += f"{self._convert_list(child)}
"
else:
result += f"{self._convert_list(child)}
"
elif child_tag == "TXT":
# Simple text element
# The only branch that HTML-escapes its text before appending it.
result += html.escape(self._get_text(child))
elif child_tag == "LINK":
# Handle links (added for lxml version)
# NOTE(review): uri is read (default "#") but not referenced in the visible literal below.
uri = child.get("URI", "#")
result += f'{self._convert_btx(child)}'
elif child_tag == "REF.DOC.OJ":
# Handle references to OJ documents
# Rendered purely from attributes; missing attributes become empty strings.
coll = child.get("COLL", "")
no_oj = child.get("NO.OJ", "")
date = child.get("DATE.PUB", "")
page = child.get("PAGE.FIRST", "")
result += (
f'{coll} {no_oj}, {date}, p. {page}'
)
else:
# Recursively process other element types
result += self._convert_btx(child)
# Tail text (the text following the child inside this element) is appended
# after the child's rendering; cross-references for it are looked up by
# passing the child element itself to extract_xrefs.
if child.tail:
xrefs = extract_xrefs(child, self.language)
tail_text = child.tail
for xref in xrefs:
# Replace the cross-reference text with a link
tail_text = self._replace_xref(tail_text, xref)
result += tail_text
return result
def _convert_list(self, list_element: ET._Element) -> str:
    """Convert the ITEM children of a Formex LIST element to HTML list items.

    Args:
        list_element: The LIST element whose direct ITEM children are rendered.

    Returns:
        The concatenated rendering of every ITEM in document order; an empty
        string when the list has no ITEM children.
    """
    rendered_items = []
    # iterchildren() matches the namespace-qualified "{uri}ITEM" form produced
    # by _get_tag directly; an XPath expression string cannot express that
    # form without a prefix mapping.
    for item in list_element.iterchildren(self._get_tag("ITEM")):
        item_parts = []
        # tag="*" restricts iteration to element children, so comments and
        # processing instructions (whose .tag is not a string) are skipped
        # instead of breaking the .replace() call below.
        for child in item.iterchildren(tag="*"):
            child_tag = child.tag.replace(self.ns_prefix, "")
            if child_tag == "NP":
                # Numbered paragraph: the first NO.P child holds the number,
                # the first TXT child holds the text.
                no_p = next(child.iterchildren(self._get_tag("NO.P")), None)
                txt = next(child.iterchildren(self._get_tag("TXT")), None)
                # An NP lacking either part contributes nothing to the item.
                if no_p is not None and txt is not None:
                    num = self._get_text(no_p)
                    text = self._get_text(txt)
                    # Handle cross-references within the text
                    for xref in extract_xrefs(txt, self.language):
                        text = self._replace_xref(text, xref)
                    item_parts.append(f'{num} {text}')
            else:
                # P and every other element go through the generic converter.
                item_parts.append(self._convert_btx(child))
        item_content = "".join(item_parts)
        rendered_items.append(f"{item_content}")
    return "".join(rendered_items)
def _convert_alinea(self, alinea: ET._Element) -> str:
"""Convert an ALINEA element to HTML."""
# Delegates the element to _convert_btx and returns that rendering inside the
# f-string below.
# NOTE(review): the literal is split across two physical lines in this view,
# which a single-quoted f-string does not allow; surrounding markup may have
# been lost — confirm against the original file.
return f'{self._convert_btx(alinea)}
'
def _convert_parag(self, parag: ET._Element) -> str:
"""Convert a PARAG (paragraph) element to HTML."""
# Renders the element children of a PARAG in document order and returns the
# concatenation. Raises RuntimeError for any child tag other than ALINEA,
# COMMENT, QUOT.S or NO.PARAG.
# NOTE(review): some literals below are split across two physical lines in
# this view; markup may be missing from them — confirm against the original file.
identifier = parag.get("IDENTIFIER", "")
# The id is only built when the IDENTIFIER attribute is non-empty.
# NOTE(review): parag_id is not referenced again in the visible code.
parag_id = self._create_id(identifier) if identifier else ""
content = ""
for child in parag.iterchildren(tag="*"):
child_tag = child.tag.replace(self.ns_prefix, "")
if child_tag == "ALINEA":
content += self._convert_alinea(child)
elif child_tag == "COMMENT":
# As written, a COMMENT child appends an empty string.
content += f''
elif child_tag == "QUOT.S":
content += f'{self._convert_btx(child)}
'
elif child_tag == "NO.PARAG":
content += (
f'{self._convert_btx(child)}'
)
else:
# NOTE(review): text_content is called as a bare name; it is not defined in
# the visible part of the file — confirm it is imported.
raise RuntimeError(
f"Unexpected child element '{child_tag}' in PARAG: {text_content(child)}"
)
return f'{content}
'
def _convert_subdiv(self, subdiv: ET._Element) -> str:
"""Convert a SUBDIV (subdivision) element to HTML, preserving child order."""
# Builds a title from the first TITLE child (its first TI plus all STI
# children), then renders the remaining element children in document order.
# Raises RuntimeError for any child tag other than TITLE, PARAG, ALINEA,
# COMMENT, QUOT.S or SUBDIV.
# NOTE(review): some literals below are split across two physical lines in
# this view; markup may be missing from them — confirm against the original file.
# Get the title using XPath (should be the first TITLE child if present)
# NOTE(review): when a namespace is configured, _get_tag returns "{uri}TAG";
# confirm that form is accepted inside an xpath() expression string.
title = ""
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
if title_elems:
title_elem = title_elems[0]
# Process TI (title) and STI (subtitle) elements
ti_elems = title_elem.xpath(f"./{self._get_tag('TI')}")
ti_text = self._convert_btx(ti_elems[0]) if ti_elems else ""
sti_list = []
for sti in title_elem.xpath(f"./{self._get_tag('STI')}"):
sti_list.append(self._convert_btx(sti))
title = f'{ti_text}
'
# All subtitles are joined with a single space.
if sti_list:
title += f'{" ".join(sti_list)}
'
# Process all children in order, skipping TITLE (already handled)
content = ""
for child in subdiv.iterchildren(tag="*"):
child_tag = child.tag.replace(self.ns_prefix, "")
if child_tag == "TITLE":
continue # already handled
elif child_tag == "PARAG":
content += self._convert_parag(child)
elif child_tag == "ALINEA":
content += self._convert_alinea(child)
elif child_tag == "COMMENT":
# As written, a COMMENT child appends an empty string.
content += f''
elif child_tag == "QUOT.S":
content += f'{self._convert_btx(child)}
'
elif child_tag == "SUBDIV":
# Nested subdivisions are rendered recursively.
content += self._convert_subdiv(child)
else:
# NOTE(review): text_content is called as a bare name; it is not defined in
# the visible part of the file — confirm it is imported.
raise RuntimeError(
f"Unexpected child element '{child_tag}' in SUBDIV: {text_content(child)}"
)
# NOTE(review): as written this returns an empty literal, so neither title nor
# content reaches the caller — the literal has probably lost its contents; confirm.
return f''
def convert_article(self, article: Union[str, ET._Element]) -> str:
"""
Convert a Formex element to HTML5.
Args:
article: Either an lxml Element or an XML string representing an ARTICLE
Returns:
A string containing the HTML5 representation of the article
"""
# Raises RuntimeError when a direct child of the article is not one of
# TI.ART, STI.ART, ALINEA, PARAG, COMMENT, QUOT.S or SUBDIV. A string that
# fails to parse does not raise: an error message string is returned instead.
# NOTE(review): some literals below are split across two physical lines in
# this view; markup may be missing from them — confirm against the original file.
# Parse the article if it's a string
if isinstance(article, str):
try:
parser = ET.XMLParser(remove_blank_text=True)
article = cast(
ET._Element, ET.fromstring(article.encode("utf-8"), parser)
)
except ET.XMLSyntaxError as e:
return f"Error parsing XML: {e}
"
# Extract the article identifier
# NOTE(review): _create_id is called even when IDENTIFIER is absent (empty
# string), and article_id is not referenced again in the visible code.
identifier = article.get("IDENTIFIER", "")
article_id = self._create_id(identifier)
# Strip processing instructions
# NOTE(review): this line names lxml.etree.PI while the rest of the method
# uses the ET alias — confirm the lxml package itself is imported.
ET.strip_tags(article, lxml.etree.PI)
# Extract the article title
# Use lxml's xpath capabilities for better namespace handling
# ".//" searches all descendants; the first match is used.
# NOTE(review): when a namespace is configured, _get_tag returns "{uri}TAG";
# confirm that form is accepted inside an xpath() expression string.
ti_art = article.xpath(f".//{self._get_tag('TI.ART')}")
ti_art = ti_art[0] if ti_art else None
article_title = self._convert_btx(ti_art) if ti_art is not None else ""
# Extract the article subtitle if present
sti_art = article.xpath(f".//{self._get_tag('STI.ART')}")
sti_art = sti_art[0] if sti_art else None
article_subtitle = self._convert_btx(sti_art) if sti_art is not None else ""
# Build the header section
# NOTE(review): the header is only built when BOTH title and subtitle are
# non-empty, which makes the inner `if article_subtitle` check always true;
# a title without a subtitle yields an empty header — confirm this is intended.
if article_title and article_subtitle:
header = f'{article_title}
'
if article_subtitle:
header += f'{article_subtitle}
'
header += ""
else:
header = ""
# Process the content based on what's present
content = ""
# Process all child elements (except TITLE) in tree order
for child in article.iterchildren(tag="*"):
child_tag = child.tag.replace(self.ns_prefix, "")
if child_tag in ["TI.ART", "STI.ART"]:
continue # already handled
elif child_tag == "ALINEA":
content += self._convert_alinea(child)
elif child_tag == "PARAG":
content += self._convert_parag(child)
elif child_tag == "COMMENT":
# As written, a COMMENT child appends an empty string.
content += f''
elif child_tag == "QUOT.S":
content += f'{self._convert_btx(child)}
'
elif child_tag == "SUBDIV":
content += self._convert_subdiv(child)
else:
# NOTE(review): text_content is called as a bare name; it is not defined in
# the visible part of the file — confirm it is imported.
raise RuntimeError(
f"Unexpected child element '{child_tag}' in ARTICLE: {text_content(child)}"
)
# Assemble the complete article
return f'{header}{content}
'