512 lines
20 KiB
Python
512 lines
20 KiB
Python
import html
|
||
import re
|
||
import warnings
|
||
from dataclasses import dataclass
|
||
from typing import Literal, Optional, Union, cast
|
||
|
||
import lxml.etree
|
||
from lxml import etree as ET
|
||
|
||
from formex_viewer.main import Language
|
||
|
||
|
||
def text_content(el: "ET._Element") -> str:
    """Return the concatenated text of an XML element and all descendants.

    Walks ``el.iter()`` (which yields ``el`` itself followed by every
    descendant in document order) and collects each node's ``text`` and
    ``tail``.  The element's own ``tail`` is deliberately included because
    callers (cross-reference extraction) also scan the text that follows
    an element.

    BUG FIX: the previous version additionally yielded ``el.text`` and
    ``el.tail`` after the loop, even though ``el.iter()`` had already
    yielded them — so the element's own text appeared twice in the result
    (e.g. ``<p>hello</p>`` produced ``"hellohello"``).

    Args:
        el: The XML element to read (lxml or stdlib ElementTree element).

    Returns:
        The concatenated text content as a single string.
    """

    def _iterate(node):
        # node.iter() includes `node` itself, so its text/tail are covered
        # exactly once here — no extra yields outside the loop.
        for descendant in node.iter():
            if descendant.text:
                yield descendant.text
            if descendant.tail:
                yield descendant.tail

    return "".join(_iterate(el))
|
||
|
||
|
||
@dataclass
class CrossReference:
    """A single cross-reference found in a legal text.

    Produced by :func:`extract_xrefs`; consumed by the HTML converter to
    render the reference as a link with data attributes.
    """

    # Kind of target the reference points at.
    target: Literal["article", "annex"]
    # The exact matched substring (e.g. "Article 5(2)"), used for replacement.
    text: str
    # Article number or annex roman numeral, as matched in the text.
    # NOTE: shadows the `id` builtin, but renaming would break callers.
    id: str
    # Paragraph number for references like "Article 5(2)"; None when absent.
    paragraph: int | None = None
|
||
|
||
|
||
def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
    """Extract article/annex cross-references from an XML element.

    Args:
        el: The XML element to extract cross-references from.
        language: The language whose reference patterns should be applied.

    Returns:
        A list of CrossReference objects, one per pattern match found in
        the element's text content. Empty for unsupported languages
        (a warning is emitted in that case).
    """
    crossrefs: list[CrossReference] = []
    text = text_content(el)

    PATTERN_PARTS = {
        Language.ENG: {
            "article": r"(Art\.|Articles?)",
            "annex": r"(Ann\.|Annex)",
            "exclusion": r"(?! of(?! this))",
        },
        Language.DEU: {
            "article": r"(Art\.|Artikels?)",
            "annex": r"(Anhang)",
            "exclusion": r"(?! von)",
        },
    }

    if language not in PATTERN_PARTS:
        warnings.warn(
            f"Language '{language}' not supported for cross-reference extraction. Returning empty list."
        )
        return []

    # Prevent zealous matching of references to other texts by using a negative lookahead
    # Also, match only at word boundaries to prevent partial matches
    parts = PATTERN_PARTS[language]
    patterns = {
        "article": rf"\b{parts['article']}\s+(?P<art_num>\d+)(?:[(](?P<parag_num>\d+)[)])?(?:{parts['exclusion']})",
        "annex": rf"\b{parts['annex']}\s+(?P<annex_num>[DILMVX]+)(?:{parts['exclusion']})",
    }
    for key, pattern in patterns.items():
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            crossref_id = (
                match.group("art_num") if key == "article" else match.group("annex_num")
            )
            # The paragraph group only exists for "Article N(M)" style
            # references; for annexes and plain "Article N" it is None.
            # BUG FIX: the previous guard raised RuntimeError whenever the
            # paragraph number was absent — i.e. for every annex reference
            # and every article reference without a "(M)" suffix — and
            # int(None) would have crashed anyway. A missing paragraph is
            # the normal case, not an error.
            parag_num = match.groupdict().get("parag_num")
            crossrefs.append(
                CrossReference(
                    target=key,  # dict keys are exactly "article"/"annex"
                    id=crossref_id,
                    paragraph=int(parag_num) if parag_num else None,
                    text=match.group(0),
                )
            )
    return crossrefs
|
||
|
||
|
||
def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None:
    """Extract a specific article from a Formex document.

    Args:
        doc: The XML document to extract from.
        article_id: The article number.

    Returns:
        The extracted article element, or None if no article with that
        identifier exists.
    """
    # IDENTIFIER attributes are zero-padded to three digits (e.g. "007").
    xpath = f".//ARTICLE[@IDENTIFIER='{article_id:03d}']"
    # FIX: evaluate the XPath once instead of twice per call.
    matches = doc.xpath(xpath)
    return matches[0] if matches else None
|
||
|
||
|
||
def extract_paragraph(
    doc: ET._Element, article_id: int, paragraph_id: int
) -> ET._Element | None:
    """Extract a specific paragraph from an article in a Formex document.

    Args:
        doc: The XML document to extract from.
        article_id: The article number.
        paragraph_id: The paragraph number.

    Returns:
        The extracted paragraph element, or None if no paragraph with that
        identifier exists.
    """
    # PARAG identifiers combine the zero-padded article and paragraph
    # numbers, e.g. "007.002".
    xpath = f".//PARAG[@IDENTIFIER='{article_id:03d}.{paragraph_id:03d}']"
    # FIX: evaluate the XPath once instead of twice per call.
    matches = doc.xpath(xpath)
    return matches[0] if matches else None
|
||
|
||
|
||
class FormexArticleConverter:
    """Converts Formex XML <ARTICLE> elements to semantic HTML5.

    The converter walks the Formex element tree and emits an HTML string,
    decorating recognised cross-references (articles/annexes) with <a>
    tags carrying data attributes so a viewer can resolve them later.
    """

    def __init__(self, language: "Language", namespace: Optional[str] = None):
        """
        Initialize the converter.

        Args:
            language: Language object to determine the language for cross-reference extraction
            namespace: Optional XML namespace to use when parsing elements
        """
        self.ns = namespace
        self.language = language
        # lxml spells namespaced tags as "{uri}TAG"; precompute the prefix.
        self.ns_prefix = f"{{{namespace}}}" if namespace else ""

    def _get_tag(self, tag: str) -> str:
        """Get the tag name with namespace prefix if one was configured."""
        return f"{self.ns_prefix}{tag}"

    def _get_text(self, element: "ET._Element") -> str:
        """Get the text content of an element, including all nested text.

        Uses lxml's C-level ``text_content()`` when available, falling back
        to manual traversal for plain elements.
        """
        if element is None:
            return ""

        try:
            # lxml elements provide this efficient built-in helper.
            return element.text_content()
        except AttributeError:
            # Manual fallback: own text, then each child subtree and tail.
            text = element.text or ""
            for child in element.iterchildren(tag="*"):
                text += self._get_text(child)
                if child.tail:
                    text += child.tail
            return text

    def _create_id(self, identifier: str) -> str:
        """Create a valid HTML ID from the article identifier."""
        # Replace anything outside [a-zA-Z0-9-] so the id is CSS/URL safe.
        clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
        return f"art-{clean_id}"

    def _replace_xref(self, text: str, xref: "CrossReference") -> str:
        """Replace a cross-reference occurrence with semantic link markup."""
        # Empty data-paragraph-id when the reference has no paragraph part.
        paragraph_id = xref.paragraph or ""
        link = (
            f'<a href="" data-target="{xref.target}" data-id="{xref.id}" '
            f'data-paragraph-id="{paragraph_id}" class="cross-ref">{xref.text}</a>'
        )
        return text.replace(xref.text, link)

    def _convert_btx(self, element: "ET._Element") -> str:
        """
        Convert basic text elements (t_btx, t_btx.seq) to HTML.

        This is a simplified implementation. In a complete version,
        this would handle all the possible child elements defined in t_btx.
        """
        if element is None:
            return ""

        result = element.text or ""

        is_title = element.tag in ("TI", "STI", "TI.ART", "STI.ART")
        # Cross-references are handled at the deepest (leaf) level only,
        # and never inside title elements.
        # FIX: getchildren() is deprecated in lxml (and removed from
        # ElementTree); len(element) counts children the supported way.
        if not is_title and len(element) == 0:
            for xref in extract_xrefs(element, self.language):
                result = self._replace_xref(result, xref)

        for child in element.iterchildren(tag="*"):
            child_tag = child.tag.replace(self.ns_prefix, "")

            # Process common inline elements
            if child_tag == "HT":
                # Highlighted text: map the TYPE attribute to an HTML tag.
                ht_type = child.get("TYPE", "NORMAL")
                if ht_type == "BOLD":
                    result += f"<strong>{self._convert_btx(child)}</strong>"
                elif ht_type == "ITALIC":
                    result += f"<em>{self._convert_btx(child)}</em>"
                elif ht_type == "SUB":
                    result += f"<sub>{self._convert_btx(child)}</sub>"
                elif ht_type == "SUP":
                    result += f"<sup>{self._convert_btx(child)}</sup>"
                elif ht_type == "UNDERLINE":
                    result += f"<u>{self._convert_btx(child)}</u>"
                elif ht_type == "SC":  # Small caps
                    result += f'<span style="font-variant: small-caps">{self._convert_btx(child)}</span>'
                else:
                    result += self._convert_btx(child)
            elif child_tag == "FT":
                # Format text (numbers, codes, etc.)
                ft_type = child.get("TYPE", "")
                if ft_type == "NUMBER" or ft_type == "DECIMAL":
                    result += (
                        f'<span class="ft-number">{self._convert_btx(child)}</span>'
                    )
                elif ft_type == "CODE":
                    result += f"<code>{self._convert_btx(child)}</code>"
                else:
                    result += f'<span class="ft-{ft_type.lower()}">{self._convert_btx(child)}</span>'
            elif child_tag == "IE":
                # Inclusion/exclusion marker
                result += '<span class="ie-marker">±</span>'
            elif child_tag == "BR":
                # Line break
                result += "<br>"
            elif child_tag == "P":
                # Paragraph
                result += f"<p>{self._convert_btx(child)}</p>"
            elif child_tag == "NOTE":
                # Note reference
                note_id = child.get("NOTE.ID", "")
                result += f'<sup class="note-ref" id="{note_id}">{self._convert_btx(child)}</sup>'
            elif child_tag == "QUOT.START":
                # Opening quotation mark
                result += "“"
            elif child_tag == "QUOT.END":
                # Closing quotation mark
                result += "”"
            elif child_tag == "LIST":
                # Formex styles to CSS list-style-type mapping
                list_style_map = {
                    "ARAB": "decimal",
                    "ALPHA": "upper-alpha",
                    "alpha": "lower-alpha",
                    "ROMAN": "upper-roman",
                    "roman": "lower-roman",
                    "BULLET": "disc",
                    "DASH": "'—'",
                    # FIX: this key had a stray trailing colon ("NDASH:"),
                    # so every NDASH list raised KeyError.
                    "NDASH": "'–'",
                    "NONE": "none",
                    "OTHER": "none",
                }

                list_type = child.get("TYPE", "BULLET")
                # FIX: use .get() so an unrecognised TYPE degrades to an
                # unstyled list instead of raising KeyError.
                list_style_type = list_style_map.get(list_type, "none")
                if list_type == "BULLET":
                    result += f"<ul>{self._convert_list(child)}</ul>"
                elif list_type in ["ARAB", "ALPHA", "alpha", "ROMAN", "roman"]:
                    result += f"<ol class='list-{list_style_type}'>{self._convert_list(child)}</ol>"
                else:
                    result += f"<ul class='list-{list_style_type}'>{self._convert_list(child)}</ul>"
            elif child_tag == "TXT":
                # Simple text element — escaped, never treated as markup.
                result += html.escape(self._get_text(child))
            elif child_tag == "LINK":
                # Hyperlink with a URI attribute
                uri = child.get("URI", "#")
                result += f'<a href="{uri}">{self._convert_btx(child)}</a>'
            elif child_tag == "REF.DOC.OJ":
                # Reference to an Official Journal document
                coll = child.get("COLL", "")
                no_oj = child.get("NO.OJ", "")
                date = child.get("DATE.PUB", "")
                page = child.get("PAGE.FIRST", "")
                result += (
                    f'<span class="ref-oj">{coll} {no_oj}, {date}, p. {page}</span>'
                )
            else:
                # Recursively process other element types
                result += self._convert_btx(child)

            if child.tail:
                # NOTE(review): xrefs are extracted from the child's content
                # but substituted into its tail text — presumably because
                # text_content() also covers the tail; confirm intent.
                tail_text = child.tail
                for xref in extract_xrefs(child, self.language):
                    tail_text = self._replace_xref(tail_text, xref)
                result += tail_text

        return result

    def _convert_list(self, list_element: "ET._Element") -> str:
        """Convert a Formex LIST element to a run of HTML <li> items."""
        result = ""
        # Using lxml's xpath to get direct child ITEM elements
        for item in list_element.xpath(f"./{self._get_tag('ITEM')}"):
            item_content = ""
            # ITEM contents should be either NP or P elements.
            for child in item:
                child_tag = child.tag.replace(self.ns_prefix, "")
                if child_tag == "NP":
                    # Numbered paragraph - extract the number and text
                    no_p_elems = child.xpath(f"./{self._get_tag('NO.P')}")
                    txt_elems = child.xpath(f"./{self._get_tag('TXT')}")

                    no_p = no_p_elems[0] if no_p_elems else None
                    txt = txt_elems[0] if txt_elems else None

                    if no_p is not None and txt is not None:
                        num = self._get_text(no_p)
                        text = self._get_text(txt)

                        # Handle cross-references within the text
                        for xref in extract_xrefs(txt, self.language):
                            text = self._replace_xref(text, xref)

                        item_content += f'<span class="item-number">{num}</span> {text}'
                else:
                    # P and all other element types share the generic
                    # conversion (the two original branches were identical).
                    item_content += self._convert_btx(child)

            result += f"<li>{item_content}</li>"

        return result

    def _convert_alinea(self, alinea: "ET._Element") -> str:
        """Convert an ALINEA (unnumbered paragraph) element to HTML."""
        return f'<p class="alinea">{self._convert_btx(alinea)}</p>'

    def _convert_parag(self, parag: "ET._Element") -> str:
        """Convert a PARAG (numbered paragraph) element to HTML."""
        identifier = parag.get("IDENTIFIER", "")
        parag_id = self._create_id(identifier) if identifier else ""

        # Get the paragraph number using XPath
        no_parag_elems = parag.xpath(f"./{self._get_tag('NO.PARAG')}")
        parag_num = self._get_text(no_parag_elems[0]) if no_parag_elems else ""

        # Process the alineas within the paragraph
        content = ""
        for alinea in parag.xpath(f"./{self._get_tag('ALINEA')}"):
            content += self._convert_alinea(alinea)

        # Process any comments
        for comment in parag.xpath(f"./{self._get_tag('COMMENT')}"):
            content += f'<div class="comment">{self._convert_btx(comment)}</div>'

        # Process any quotations
        for quot in parag.xpath(f"./{self._get_tag('QUOT.S')}"):
            content += (
                f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
            )

        return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'

    def _convert_subdiv(self, subdiv: "ET._Element") -> str:
        """Convert a SUBDIV (subdivision) element to HTML, recursively."""
        # Get the title using XPath
        title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
        title = ""
        if title_elems:
            title_elem = title_elems[0]
            # Process TI (title) and STI (subtitle) elements
            ti_elems = title_elem.xpath(f"./{self._get_tag('TI')}")
            ti_text = self._convert_btx(ti_elems[0]) if ti_elems else ""

            sti_list = []
            for sti in title_elem.xpath(f"./{self._get_tag('STI')}"):
                sti_list.append(self._convert_btx(sti))

            title = f'<h4 class="subdivision-title">{ti_text}</h4>'
            if sti_list:
                title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>'

        # Process content: either paragraphs, alineas, or nested subdivisions
        content = ""

        # Process paragraphs directly under this subdivision
        for parag in subdiv.xpath(f"./{self._get_tag('PARAG')}"):
            content += self._convert_parag(parag)

        # Process alineas directly under this subdivision
        for alinea in subdiv.xpath(f"./{self._get_tag('ALINEA')}"):
            content += self._convert_alinea(alinea)

        # Process comments directly under this subdivision
        for comment in subdiv.xpath(f"./{self._get_tag('COMMENT')}"):
            content += f'<div class="comment">{self._convert_btx(comment)}</div>'

        # Process quotations directly under this subdivision
        for quot in subdiv.xpath(f"./{self._get_tag('QUOT.S')}"):
            content += (
                f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
            )

        # Process nested subdivisions directly under this subdivision
        for sub in subdiv.xpath(f"./{self._get_tag('SUBDIV')}"):
            content += self._convert_subdiv(sub)

        return f'<section class="subdivision">{title}{content}</section>'

    def convert_article(self, article: Union[str, "ET._Element"]) -> str:
        """
        Convert a Formex <ARTICLE> element to HTML5.

        Args:
            article: Either an lxml Element or an XML string representing an ARTICLE

        Returns:
            A string containing the HTML5 representation of the article
        """
        # Parse the article if it's a string
        if isinstance(article, str):
            try:
                parser = ET.XMLParser(remove_blank_text=True)
                article = cast(
                    ET._Element, ET.fromstring(article.encode("utf-8"), parser)
                )
            except ET.XMLSyntaxError as e:
                return f"<p>Error parsing XML: {e}</p>"

        # Extract the article identifier
        identifier = article.get("IDENTIFIER", "")
        article_id = self._create_id(identifier)

        # Strip processing instructions
        ET.strip_tags(article, lxml.etree.PI)

        # Extract the article title
        # Use lxml's xpath capabilities for better namespace handling
        ti_art = article.xpath(f".//{self._get_tag('TI.ART')}")
        ti_art = ti_art[0] if ti_art else None
        article_title = self._convert_btx(ti_art) if ti_art is not None else ""

        # Extract the article subtitle if present
        sti_art = article.xpath(f".//{self._get_tag('STI.ART')}")
        sti_art = sti_art[0] if sti_art else None
        article_subtitle = self._convert_btx(sti_art) if sti_art is not None else ""

        # Build the header section.
        # FIX: the header was previously emitted only when BOTH a title and
        # a subtitle were present, silently dropping the header of
        # title-only articles; the inner check already handles a missing
        # subtitle.
        if article_title:
            header = f'<header><h3 class="article-title">{article_title}</h3>'
            if article_subtitle:
                header += f'<h4 class="article-subtitle">{article_subtitle}</h4>'
            header += "</header>"
        else:
            header = ""

        # Process the content based on what's present
        content = ""

        # Alineas directly under the article
        for alinea in article.xpath(f"./{self._get_tag('ALINEA')}"):
            content += self._convert_alinea(alinea)

        # Paragraphs directly under the article
        for parag in article.xpath(f"./{self._get_tag('PARAG')}"):
            content += self._convert_parag(parag)

        # Comments directly under the article
        for comment in article.xpath(f"./{self._get_tag('COMMENT')}"):
            content += f'<div class="comment">{self._convert_btx(comment)}</div>'

        # Quotations directly under the article
        for quot in article.xpath(f"./{self._get_tag('QUOT.S')}"):
            content += f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'

        # Subdivisions directly under the article
        for subdiv in article.xpath(f"./{self._get_tag('SUBDIV')}"):
            content += self._convert_subdiv(subdiv)

        # Assemble the complete article
        return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>'
|