Article cross-reference linking
This commit is contained in:
		| @@ -25,6 +25,23 @@ function Panel({ language }: PanelProps) { | |||||||
|     const articleElement = articleRef.current; |     const articleElement = articleRef.current; | ||||||
|     if (!articleElement) return; |     if (!articleElement) return; | ||||||
|  |  | ||||||
|  |     // Replace cross-reference links with page navigation | ||||||
|  |     const crossRefs = articleElement.querySelectorAll( | ||||||
|  |       "a.cross-ref" | ||||||
|  |     ) as NodeListOf<HTMLAnchorElement>; | ||||||
|  |     crossRefs.forEach((link) => { | ||||||
|  |       const target = link.getAttribute("data-target"); | ||||||
|  |       const targetId = link.getAttribute("data-id"); | ||||||
|  |  | ||||||
|  |       if (target && targetId) { | ||||||
|  |         if (target === "article") { | ||||||
|  |           link.setAttribute("href", `../articles/${targetId}`); | ||||||
|  |         } | ||||||
|  |       } else { | ||||||
|  |         console.warn("No target or ID found for link:", link); | ||||||
|  |       } | ||||||
|  |     }); | ||||||
|  |  | ||||||
|     const paragraphs = articleElement.querySelectorAll(".paragraph"); |     const paragraphs = articleElement.querySelectorAll(".paragraph"); | ||||||
|  |  | ||||||
|     // Highlight the selected paragraph |     // Highlight the selected paragraph | ||||||
|   | |||||||
| @@ -1,10 +1,14 @@ | |||||||
| import html | import html | ||||||
| import re | import re | ||||||
|  | import warnings | ||||||
|  | from dataclasses import dataclass | ||||||
| from typing import Optional, Union | from typing import Optional, Union | ||||||
|  |  | ||||||
| import lxml.etree | import lxml.etree | ||||||
| from lxml import etree as ET | from lxml import etree as ET | ||||||
|  |  | ||||||
|  | from formex_viewer.main import Language | ||||||
|  |  | ||||||
|  |  | ||||||
| def text_content(el: lxml.etree.Element) -> str: | def text_content(el: lxml.etree.Element) -> str: | ||||||
|     """Get the text content of an XML element, including all child elements.""" |     """Get the text content of an XML element, including all child elements.""" | ||||||
| @@ -23,17 +27,79 @@ def text_content(el: lxml.etree.Element) -> str: | |||||||
|     return "".join(_iterate(el)) |     return "".join(_iterate(el)) | ||||||
|  |  | ||||||
|  |  | ||||||
@dataclass
class CrossReference:
    """A reference to an article or annex that was detected in running text."""

    # Number of the referenced article (digits) or numeral of the referenced
    # annex (Roman numeral letters), exactly as captured from the text.
    id: str
    # The full matched text of the reference, e.g. the keyword plus its number.
    text: str
    # Kind of item being referenced: "article" or "annex".
    target: str
|  |  | ||||||
|  |  | ||||||
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
    """Extract article and annex cross-references from an XML element.

    The element's text content (as produced by ``text_content``) is scanned
    case-insensitively with language-specific patterns for references such as
    "Article 5" or "Annex II".

    Args:
        el: The XML element to extract cross-references from.
        language: Language of the text. It selects the keywords used for
            matching; only ``Language.ENG`` and ``Language.DEU`` are supported.

    Returns:
        A list of ``CrossReference`` objects: all article references in order
        of appearance, followed by all annex references in order of
        appearance. The list is empty (and a warning is issued) when the
        language is not supported.
    """
    # Regex fragments per language. "exclusion" is a negative lookahead placed
    # right after the number; for English it rejects "... of <something>"
    # unless that something is "this".
    pattern_parts = {
        Language.ENG: {
            "article": r"(Art\.|Articles?)",
            "annex": r"(Ann\.|Annex)",
            "exclusion": r"(?! of(?! this))",
        },
        Language.DEU: {
            "article": r"(Art\.|Artikels?)",
            "annex": r"(Anhang)",
            "exclusion": r"(?! von)",
        },
    }

    if language not in pattern_parts:
        warnings.warn(
            f"Language '{language}' not supported for cross-reference extraction. Returning empty list."
        )
        return []

    # Prevent over-zealous matching of references to other texts by using a
    # negative lookahead. Also, match only at word boundaries to prevent
    # partial matches.
    # Single quotes are used for the subscripts so that the f-strings also
    # parse on Python versions older than 3.12.
    parts = pattern_parts[language]
    patterns = {
        "article": (
            rf"\b{parts['article']}\s+(?P<art_num>\d+){parts['exclusion']}\b",
            "art_num",
        ),
        "annex": (
            rf"\b{parts['annex']}\s+(?P<annex_num>[DILMVX]+){parts['exclusion']}\b",
            "annex_num",
        ),
    }

    text = text_content(el)
    crossrefs = []
    for target, (pattern, group_name) in patterns.items():
        crossrefs.extend(
            CrossReference(
                id=match.group(group_name), text=match.group(0), target=target
            )
            for match in re.finditer(pattern, text, flags=re.IGNORECASE)
        )
    return crossrefs
|  |  | ||||||
|  |  | ||||||
| class FormexArticleConverter: | class FormexArticleConverter: | ||||||
|     """Converts Formex XML <ARTICLE> elements to semantic HTML5.""" |     """Converts Formex XML <ARTICLE> elements to semantic HTML5.""" | ||||||
|  |  | ||||||
|     def __init__(self, namespace: Optional[str] = None): |     def __init__(self, language: Language, namespace: Optional[str] = None): | ||||||
|         """ |         """ | ||||||
|         Initialize the converter. |         Initialize the converter. | ||||||
|  |  | ||||||
|         Args: |         Args: | ||||||
|  |             language: Language object to determine the language for cross-reference extraction | ||||||
|             namespace: Optional XML namespace to use when parsing elements |             namespace: Optional XML namespace to use when parsing elements | ||||||
|         """ |         """ | ||||||
|         self.ns = namespace |         self.ns = namespace | ||||||
|  |         self.language = language | ||||||
|         self.ns_prefix = f"{{{namespace}}}" if namespace else "" |         self.ns_prefix = f"{{{namespace}}}" if namespace else "" | ||||||
|  |  | ||||||
|     def _get_tag(self, tag: str) -> str: |     def _get_tag(self, tag: str) -> str: | ||||||
| @@ -67,6 +133,15 @@ class FormexArticleConverter: | |||||||
|         clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier) |         clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier) | ||||||
|         return f"art-{clean_id}" |         return f"art-{clean_id}" | ||||||
|  |  | ||||||
|  |     def _replace_xref(self, text: str, xref: CrossReference) -> str: | ||||||
|  |         """Replace a cross-reference instance with semantic markup in the text.""" | ||||||
|  |         # Replace the cross-reference text with a link | ||||||
|  |         text = text.replace( | ||||||
|  |             xref.text, | ||||||
|  |             f'<a href="" data-target="{xref.target}" data-id="{xref.id}" class="cross-ref">{xref.text}</a>', | ||||||
|  |         ) | ||||||
|  |         return text | ||||||
|  |  | ||||||
|     def _convert_btx(self, element: ET.Element) -> str: |     def _convert_btx(self, element: ET.Element) -> str: | ||||||
|         """ |         """ | ||||||
|         Convert basic text elements (t_btx, t_btx.seq) to HTML. |         Convert basic text elements (t_btx, t_btx.seq) to HTML. | ||||||
| @@ -79,6 +154,16 @@ class FormexArticleConverter: | |||||||
|  |  | ||||||
|         result = element.text or "" |         result = element.text or "" | ||||||
|  |  | ||||||
|  |         is_title = element.tag in ("TI", "STI", "TI.ART", "STI.ART") | ||||||
|  |         if not is_title and not element.getchildren(): | ||||||
|  |             # Cross-references should be treated at the deepest level | ||||||
|  |             xrefs = extract_xrefs(element, self.language) | ||||||
|  |             print("Extracted cross-references: ", xrefs) | ||||||
|  |  | ||||||
|  |             for xref in xrefs: | ||||||
|  |                 # Replace the cross-reference text with a link | ||||||
|  |                 result = self._replace_xref(result, xref) | ||||||
|  |  | ||||||
|         for child in element: |         for child in element: | ||||||
|             child_tag = child.tag.replace(self.ns_prefix, "") |             child_tag = child.tag.replace(self.ns_prefix, "") | ||||||
|  |  | ||||||
| @@ -176,7 +261,13 @@ class FormexArticleConverter: | |||||||
|                 result += self._convert_btx(child) |                 result += self._convert_btx(child) | ||||||
|  |  | ||||||
|             if child.tail: |             if child.tail: | ||||||
|                 result += child.tail |                 xrefs = extract_xrefs(child, self.language) | ||||||
|  |                 tail_text = child.tail | ||||||
|  |                 for xref in xrefs: | ||||||
|  |                     # Replace the cross-reference text with a link | ||||||
|  |                     tail_text = self._replace_xref(tail_text, xref) | ||||||
|  |  | ||||||
|  |                 result += tail_text | ||||||
|  |  | ||||||
|         return result |         return result | ||||||
|  |  | ||||||
| @@ -200,6 +291,12 @@ class FormexArticleConverter: | |||||||
|                     if no_p is not None and txt is not None: |                     if no_p is not None and txt is not None: | ||||||
|                         num = self._get_text(no_p) |                         num = self._get_text(no_p) | ||||||
|                         text = self._get_text(txt) |                         text = self._get_text(txt) | ||||||
|  |  | ||||||
|  |                         # Handle cross-references within the text | ||||||
|  |                         xrefs = extract_xrefs(txt, self.language) | ||||||
|  |                         for xref in xrefs: | ||||||
|  |                             text = self._replace_xref(text, xref) | ||||||
|  |  | ||||||
|                         item_content += f'<span class="item-number">{num}</span> {text}' |                         item_content += f'<span class="item-number">{num}</span> {text}' | ||||||
|                 elif child_tag == "P": |                 elif child_tag == "P": | ||||||
|                     # Regular paragraph |                     # Regular paragraph | ||||||
|   | |||||||
| @@ -133,7 +133,7 @@ def article(celex_id: str, article_id: int, language: Language = Language.ENG): | |||||||
|         num = article.get("IDENTIFIER").lstrip("0") |         num = article.get("IDENTIFIER").lstrip("0") | ||||||
|         if num == str(article_id): |         if num == str(article_id): | ||||||
|             return Response( |             return Response( | ||||||
|                 FormexArticleConverter().convert_article(article), |                 FormexArticleConverter(language=language).convert_article(article), | ||||||
|                 media_type="text/html", |                 media_type="text/html", | ||||||
|             ) |             ) | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user