Coverage for src/wiktextract/extractor/ms/translation.py: 76%
70 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from .models import Translation, WordEntry
6from .tags import translate_raw_tags
def extract_translation_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract the translations under ``level_node`` and attach them to entries.

    The section's children are scanned in order.  A translation-table header
    template (``ter-atas``, ``teratas`` or ``trans-top``) sets the sense text
    used for the list items that follow it; every list item is handed to
    ``extract_translation_list_item``.

    The collected translations (only those with a non-empty ``word``) and the
    categories gathered while cleaning the header templates are then added to:

    * ``base_data`` when ``page_data`` is empty or its last entry has a
      different ``lang_code`` than ``base_data``;
    * every entry of ``page_data`` sharing the last entry's ``lang_code``
      when ``level_node`` is a level-3 heading;
    * only the last entry of ``page_data`` otherwise.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        page_data: Word entries collected so far for the page; mutated in place.
        base_data: Language-level base entry; mutated in place when selected.
        level_node: The translation section's heading node.
    """
    sense = ""
    tr_list = []
    cats = {}
    for node in level_node.children:
        if isinstance(node, TemplateNode) and node.template_name in [
            "ter-atas",
            "teratas",
            "trans-top",
        ]:
            # ``cats`` is passed so categories reported while cleaning the
            # header can be read back from its "categories" key below.
            sense = clean_node(wxr, cats, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                tr_list.extend(
                    extract_translation_list_item(wxr, list_item, sense)
                )

    # Decide which entries receive the data, then merge once; the three
    # cases previously each carried their own copy of the merge loop.
    if not page_data or page_data[-1].lang_code != base_data.lang_code:
        targets = [base_data]
    elif level_node.kind == NodeKind.LEVEL3:
        last_lang_code = page_data[-1].lang_code
        targets = [
            entry for entry in page_data if entry.lang_code == last_lang_code
        ]
    else:
        targets = [page_data[-1]]

    section_categories = cats.get("categories", [])
    for entry in targets:
        entry.categories.extend(section_categories)
        for tr_data in tr_list:
            # Translations whose word could not be extracted are dropped.
            if tr_data.word != "":
                entry.translations.append(tr_data)
                entry.categories.extend(tr_data.categories)
def extract_translation_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sense: str
) -> list[Translation]:
    """Extract all translations from one list item, including nested lists.

    The item's children are processed in order:

    * the first plain-text child ending in ``:`` supplies the language name
      (``"unknown"`` is kept if the stripped text is empty);
    * a ``t``/``trad``/``tø``/``t-``/``t+`` template yields one translation
      via ``extract_t_template``;
    * a qualifier template (``penerang``, ``qualifier``, ``i``, ``q``,
      ``qual``) following a translation adds a raw tag to that translation;
    * a nested list is processed recursively with the same ``sense``.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        list_item: The list-item node to process.
        sense: Sense text of the enclosing translation table.

    Returns:
        The translations found, in document order.  Entries with an empty
        ``word`` are not filtered out here.
    """
    # The annotation used to read ``-> None`` although the list is returned
    # and consumed by callers through ``list.extend``.
    tr_list = []
    lang_name = "unknown"
    for node in list_item.children:
        if (
            isinstance(node, str)
            and node.strip().endswith(":")
            and lang_name == "unknown"
        ):
            lang_name = node.strip(": ") or "unknown"
        elif isinstance(node, TemplateNode) and node.template_name in [
            "t",
            "trad",
            "tø",
            "t-",
            "t+",
        ]:
            tr_list.append(extract_t_template(wxr, node, sense, lang_name))
        elif (
            isinstance(node, TemplateNode)
            and node.template_name
            in ["penerang", "qualifier", "i", "q", "qual"]
            and len(tr_list) > 0
        ):
            # A qualifier applies to the translation immediately before it.
            raw_tag = clean_node(wxr, None, node).strip("() ")
            if raw_tag != "":
                tr_list[-1].raw_tags.append(raw_tag)
                translate_raw_tags(tr_list[-1])
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                tr_list.extend(
                    extract_translation_list_item(wxr, child_list_item, sense)
                )
    return tr_list
def extract_t_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    lang_name: str,
) -> Translation:
    """Build a ``Translation`` from a single translation template.

    The language code is the cleaned first positional template argument, or
    ``"unknown"`` if that is empty.  The template is then expanded and its
    ``<span>`` elements inspected:

    * the first span whose ``lang`` attribute equals the language code
      provides the translated word;
    * a span with class exactly ``gender`` contributes the text of its
      ``<abbr>`` children as raw tags (placeholder labels are skipped);
    * a span whose class attribute contains ``tr`` provides the romanization.

    Raw tags are translated and link nodes are cleaned into the result only
    when a word was found.

    Args:
        wxr: Extraction context used for expansion and cleaning.
        t_node: The translation template node.
        sense: Sense text of the enclosing translation table.
        lang_name: Language name read from the list item.

    Returns:
        The populated ``Translation``; its ``word`` is ``""`` when no
        matching span was found.
    """
    code_arg = t_node.template_parameters.get(1, "")
    lang_code = clean_node(wxr, None, code_arg) or "unknown"
    translation = Translation(
        word="", lang=lang_name, lang_code=lang_code, sense=sense
    )

    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    # Gender labels that carry no information and must not become tags.
    skipped_gender_labels = ["", "?", "jantina tidak diberi"]
    for span in expanded.find_html("span"):
        css_class = span.attrs.get("class", "")
        if span.attrs.get("lang") == lang_code and translation.word == "":
            translation.word = clean_node(wxr, None, span)
        elif css_class == "gender":
            for abbr in span.find_html("abbr"):
                label = clean_node(wxr, None, abbr)
                if label not in skipped_gender_labels:
                    translation.raw_tags.append(label)
        # NOTE(review): substring test on the whole class attribute, so any
        # class containing "tr" matches - confirm this is intended.
        elif "tr" in css_class:
            translation.roman = clean_node(wxr, None, span)

    if translation.word == "":
        return translation

    translate_raw_tags(translation)
    for link in expanded.find_child(NodeKind.LINK):
        # Return value unused; the translation is passed as the second
        # argument, presumably so link categories are recorded on it.
        clean_node(wxr, translation, link)
    return translation