Correctly extract TOC text entries
The previous code kept only the first text node of each TOC entry, so titles and subtitles containing nested XML elements were truncated; the text of all nested nodes is now joined.
This commit is contained in:
@@ -68,9 +68,16 @@ def article_ids(celex_id: str, language: Language = Language.ENG):
|
|||||||
|
|
||||||
@api_router.get("/{celex_id}/toc/{language}")
|
@api_router.get("/{celex_id}/toc/{language}")
|
||||||
def toc(celex_id: str, language: Language = Language.ENG):
|
def toc(celex_id: str, language: Language = Language.ENG):
|
||||||
|
def _extract_text(root: ET.Element, tag: str) -> str:
|
||||||
|
"""
|
||||||
|
Extract text from the given tag in the XML element.
|
||||||
|
"""
|
||||||
|
text = root.xpath(f"{tag}//text()")
|
||||||
|
return "".join(text) if text else ""
|
||||||
|
|
||||||
def _handle_division(division: ET.Element, level: int):
|
def _handle_division(division: ET.Element, level: int):
|
||||||
title = ti_el[0] if (ti_el := division.xpath("TITLE/TI//text()")) else ""
|
title = _extract_text(division, "TITLE/TI")
|
||||||
subtitle = sti_el[0] if (sti_el := division.xpath("TITLE/STI//text()")) else ""
|
subtitle = _extract_text(division, "TITLE/STI")
|
||||||
|
|
||||||
subdivisions = []
|
subdivisions = []
|
||||||
for subdivision in division.xpath("DIVISION") or []:
|
for subdivision in division.xpath("DIVISION") or []:
|
||||||
@@ -81,10 +88,9 @@ def toc(celex_id: str, language: Language = Language.ENG):
|
|||||||
art_id = article.get("IDENTIFIER")
|
art_id = article.get("IDENTIFIER")
|
||||||
if not art_id:
|
if not art_id:
|
||||||
continue
|
continue
|
||||||
art_title = ti_el[0] if (ti_el := article.xpath("TI.ART//text()")) else ""
|
|
||||||
art_subtitle = (
|
art_title = _extract_text(article, "TI.ART")
|
||||||
sti_el[0] if (sti_el := article.xpath("STI.ART//text()")) else ""
|
art_subtitle = _extract_text(article, "STI.ART")
|
||||||
)
|
|
||||||
articles.append(
|
articles.append(
|
||||||
{
|
{
|
||||||
"id": int(art_id.lstrip("0")),
|
"id": int(art_id.lstrip("0")),
|
||||||
|
|||||||
Reference in New Issue
Block a user