Coverage for src/wiktextract/extractor/ms/linkage.py: 86%
92 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1from collections import defaultdict
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Form, Linkage, WordEntry
8from .section_titles import LINKAGE_SECTIONS
def extract_form_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    tags: list[str],
) -> None:
    """Append a ``Form`` to ``word_entry.forms`` for each form in a section.

    Two sources are scanned, in this order:

    1. Direct children of ``level_node`` that are links, or templates whose
       name is one of ``ARchar``, ``Arab``, ``PSchar`` or ``SDchar``.
    2. Links that are direct children of list items in the section's lists.

    Every node is reduced to text with ``clean_node``; nodes that clean to an
    empty string are skipped.  Each created ``Form`` receives the same
    ``tags`` list that was passed in.
    """
    form_templates = ("ARchar", "Arab", "PSchar", "SDchar")

    def add_form(source_node) -> None:
        # Shared by both passes: clean the node and keep non-empty text only.
        text = clean_node(wxr, None, source_node)
        if text != "":
            word_entry.forms.append(Form(form=text, tags=tags))

    # Pass 1: links and accepted templates directly under the heading.
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        accepted_template = (
            isinstance(child, TemplateNode)
            and child.template_name in form_templates
        )
        if accepted_template or child.kind == NodeKind.LINK:
            add_form(child)

    # Pass 2: links inside the items of any list under the heading.
    for list_node in level_node.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            for link in item.find_child(NodeKind.LINK):
                add_form(link)
def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect the linkages of one section and merge them into word entries.

    The initial linkage name is the lower-cased cleaned heading text
    (``level_node.largs``).  Each list item is handed to
    ``extract_linkage_list_item``; a non-empty return value becomes the
    linkage name used for the following items.

    The collected data is then merged into:

    * ``base_data`` when ``page_data`` is empty or its last entry has a
      different ``lang_code`` than ``base_data``;
    * every entry of ``page_data`` sharing the last entry's ``lang_code``
      when the section is a level-3 heading;
    * only the last entry of ``page_data`` otherwise.
    """
    collected = defaultdict(list)
    current_name = clean_node(wxr, None, level_node.largs).lower()
    for list_node in level_node.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            returned_name = extract_linkage_list_item(
                wxr, collected, current_name, item
            )
            if returned_name != "":
                current_name = returned_name

    # Decide which entries receive the data, then merge in a single place.
    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        targets = [base_data]
    elif level_node.kind == NodeKind.LEVEL3:
        last_lang_code = page_data[-1].lang_code
        targets = [
            entry for entry in page_data if entry.lang_code == last_lang_code
        ]
    else:
        targets = [page_data[-1]]

    for entry in targets:
        for field, linkages in collected.items():
            getattr(entry, field).extend(linkages)
def extract_linkage_list_item(
    wxr: WiktextractContext,
    l_dict: dict[str, list[Linkage]],
    linkage_name: str,
    list_item: WikiNode,
) -> str:
    """Extract linkage data from one list item into ``l_dict``.

    Three item shapes are handled:

    * The item has a non-empty ``definition``: the cleaned, lower-cased text
      of ``list_item.children`` is taken as the linkage name.  If that name
      is not a key of ``LINKAGE_SECTIONS`` an empty string is returned and
      nothing is stored.  Otherwise each link in the definition becomes a
      ``Linkage``, and each plain string is split on commas into words.
    * The item contains a bold node: it is passed to
      ``extract_proverb_list``.
    * Anything else: links become ``Linkage`` objects tagged with the text of
      the most recent ``sense`` template, and a string ending in ``:`` may
      switch the linkage name when it names a known section.

    Args:
        wxr: Context passed through to ``clean_node``.
        l_dict: Mapping from ``WordEntry`` field name to collected linkages;
            mutated in place.
        linkage_name: Linkage name in effect before this item.
        list_item: The list item node to process.

    Returns:
        The linkage name in effect after this item, or ``""`` when the item
        has a definition whose term is not a known linkage section.
    """
    if list_item.definition is not None and len(list_item.definition) > 0:
        linkage_name = clean_node(wxr, None, list_item.children).lower()
        if linkage_name not in LINKAGE_SECTIONS:
            return ""
        field = LINKAGE_SECTIONS[linkage_name]
        for node in list_item.definition:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                if word != "":
                    l_dict[field].append(Linkage(word=word))
            elif isinstance(node, str):
                # Plain text may hold several comma-separated words.
                for word in node.split(","):
                    word = word.strip(" .\n")
                    if word != "":
                        l_dict[field].append(Linkage(word=word))
    elif list_item.contain_node(NodeKind.BOLD):
        # Guard the lookup like the plain-link branch below does; an unknown
        # linkage name would otherwise raise KeyError here.
        if linkage_name in LINKAGE_SECTIONS:
            extract_proverb_list(
                wxr, l_dict, list_item, LINKAGE_SECTIONS[linkage_name]
            )
    else:
        sense = ""
        for node in list_item.children:
            if isinstance(node, TemplateNode) and node.template_name == "sense":
                sense = clean_node(wxr, None, node).strip("(): ")
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                if word != "" and linkage_name in LINKAGE_SECTIONS:
                    l_dict[LINKAGE_SECTIONS[linkage_name]].append(
                        Linkage(word=word, sense=sense)
                    )
            elif isinstance(node, str) and node.strip().endswith(":"):
                # A "name:" prefix switches the linkage type for later links,
                # but only when the name is a known section.
                new_linkage_name = node.strip("(): ").lower()
                if new_linkage_name in LINKAGE_SECTIONS:
                    linkage_name = new_linkage_name

    return linkage_name
# Maps a linkage template name to the name of the WordEntry list attribute
# that extract_nyms_template appends to.
LINKAGE_TEMPLATES: dict[str, str] = {
    # antonyms
    "antonim": "antonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    # synonyms
    "sinonim": "synonyms",
    "synonyms": "synonyms",
    "syn": "synonyms",
    "sin": "synonyms",
    # hypernyms
    "hypernyms": "hypernyms",
    "hyper": "hypernyms",
    # coordinate terms
    "kata setara": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
    "perkataan koordinat": "coordinate_terms",
    "cot": "coordinate_terms",
    # hyponyms
    "hiponim": "hyponyms",
    "hipo": "hyponyms",
    "hyponyms": "hyponyms",
}
def extract_nyms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Add the words produced by a linkage template to ``word_entry``.

    The template is expanded and re-parsed.  Every ``span`` element whose
    ``lang`` attribute equals the cleaned first template parameter yields one
    ``Linkage``.  When the entry's last sense has glosses, they are joined
    with spaces and stored as the linkage's ``sense``.  The target list
    attribute is chosen through ``LINKAGE_TEMPLATES`` by template name.
    """
    # Modul:nyms
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)
    first_arg = t_node.template_parameters.get(1, "")
    lang_code = clean_node(wxr, None, first_arg)

    for span in expanded_root.find_html_recursively("span"):
        if span.attrs.get("lang", "") != lang_code:
            continue
        text = clean_node(wxr, None, span)
        if text == "":
            continue

        linkage = Linkage(word=text)
        has_gloss = (
            len(word_entry.senses) > 0
            and len(word_entry.senses[-1].glosses) > 0
        )
        if has_gloss:
            linkage.sense = " ".join(word_entry.senses[-1].glosses)

        field = LINKAGE_TEMPLATES[t_node.template_name]
        getattr(word_entry, field).append(linkage)
def extract_proverb_list(
    wxr: WiktextractContext,
    l_dict: dict[str, list[Linkage]],
    list_item: WikiNode,
    linkage_type: str,
) -> None:
    """Extract bold proverbs and their shared meaning from a list item.

    Every bold child whose cleaned text is non-empty is collected as a
    proverb.  The meaning is everything after the first ``:`` found in a
    plain-string child that follows a bold node, up to the end of the list
    item.  One ``Linkage`` per proverb, all carrying that meaning as
    ``sense``, is appended to ``l_dict[linkage_type]``.

    Args:
        wxr: Context passed through to ``clean_node``.
        l_dict: Mapping from field name to collected linkages; mutated in
            place.
        list_item: The list item node holding the bold proverb(s).
        linkage_type: Key of ``l_dict`` to append to.
    """
    proverbs: list[str] = []
    after_bold = False
    sense_found = False
    sense = ""
    for index, node in enumerate(list_item.children):
        if isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
            proverb = clean_node(wxr, None, node)
            if proverb != "":
                proverbs.append(proverb)
            after_bold = True
        elif (
            after_bold
            and not sense_found
            and isinstance(node, str)
            and ":" in node
        ):
            # The sense already spans the rest of the item, so only the first
            # colon counts.  A later string containing ":" must not replace
            # it with a shorter tail.
            sense = clean_node(
                wxr,
                None,
                [node.partition(":")[2]] + list_item.children[index + 1 :],
            )
            sense_found = True
    for proverb in proverbs:
        l_dict[linkage_type].append(Linkage(word=proverb, sense=sense))