From f21158c6c0d1889517d0836a17dbf552f574605f Mon Sep 17 00:00:00 2001
From: Adrian Rumpold
Date: Fri, 25 Apr 2025 09:18:23 +0200
Subject: [PATCH] Correctly extract TOC text entries

The previous code could not correctly handle nested XML elements in the TOC text entries.
---
 src/formex_viewer/server.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/formex_viewer/server.py b/src/formex_viewer/server.py
index dc13c55..75f7088 100644
--- a/src/formex_viewer/server.py
+++ b/src/formex_viewer/server.py
@@ -68,9 +68,16 @@ def article_ids(celex_id: str, language: Language = Language.ENG):
 
 @api_router.get("/{celex_id}/toc/{language}")
 def toc(celex_id: str, language: Language = Language.ENG):
+    def _extract_text(root: ET.Element, tag: str) -> str:
+        """
+        Extract text from the given tag in the XML element.
+        """
+        text = root.xpath(f"{tag}//text()")
+        return "".join(text) if text else ""
+
     def _handle_division(division: ET.Element, level: int):
-        title = ti_el[0] if (ti_el := division.xpath("TITLE/TI//text()")) else ""
-        subtitle = sti_el[0] if (sti_el := division.xpath("TITLE/STI//text()")) else ""
+        title = _extract_text(division, "TITLE/TI")
+        subtitle = _extract_text(division, "TITLE/STI")
 
         subdivisions = []
         for subdivision in division.xpath("DIVISION") or []:
@@ -81,10 +88,9 @@ def toc(celex_id: str, language: Language = Language.ENG):
             art_id = article.get("IDENTIFIER")
             if not art_id:
                 continue
-            art_title = ti_el[0] if (ti_el := article.xpath("TI.ART//text()")) else ""
-            art_subtitle = (
-                sti_el[0] if (sti_el := article.xpath("STI.ART//text()")) else ""
-            )
+
+            art_title = _extract_text(article, "TI.ART")
+            art_subtitle = _extract_text(article, "STI.ART")
             articles.append(
                 {
                     "id": int(art_id.lstrip("0")),