Coverage for src / wiktextract / extractor / ja / linkage.py: 90%

172 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-21 08:01 +0000

1from mediawiki_langcodes import name_to_code 

2from wikitextprocessor import ( 

3 HTMLNode, 

4 LevelNode, 

5 NodeKind, 

6 TemplateNode, 

7 WikiNode, 

8) 

9 

10from ...page import clean_node 

11from ...wxr_context import WiktextractContext 

12from ..ruby import extract_ruby 

13from .models import Descendant, Form, Linkage, WordEntry 

14from .section_titles import LINKAGES 

15from .tags import translate_raw_tags 

16 

17 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Extract one linkage section (synonyms, antonyms, …) into word_entry.

    Sections typed "cognates" or "descendants" are routed to the dedicated
    descendant extractor instead of the generic linkage handling.
    """
    if linkage_type in ("cognates", "descendants"):
        extract_descendant_section(wxr, word_entry, level_node, linkage_type)
        return

    current_sense = ""
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        is_rel_top = isinstance(
            child, TemplateNode
        ) and child.template_name.startswith("rel-top")
        if is_rel_top:
            # a "rel-top" template's first argument is the sense that the
            # following list(s) belong to
            current_sense = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
        elif child.kind == NodeKind.LIST:
            for item in child.find_child_recursively(NodeKind.LIST_ITEM):
                # a list item may rename the linkage type for later items
                linkage_type = process_linkage_list_item(
                    wxr, word_entry, item, linkage_type, current_sense
                )

39 

40 

def process_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
) -> str:
    """Process a single linkage list item and append results to word_entry.

    Returns the (possibly updated) linkage type so the caller can carry a
    "類義語:"-style prefix over to the items that follow.
    """
    seen_colon = False
    for idx, child in enumerate(list_item.children):
        if isinstance(child, str) and ":" in child and not seen_colon:
            # text before the first colon names the linkage type
            type_text = clean_node(wxr, None, list_item.children[:idx])
            linkage_type = LINKAGES.get(type_text, linkage_type)
            seen_colon = True
        elif isinstance(child, TemplateNode) and child.template_name.startswith(
            ("おくりがな", "ふりがな", "xlink")
        ):
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            ruby, without_ruby = extract_ruby(wxr, expanded.children)
            if child.template_name == "xlink":
                # "xlink" output carries no ruby worth keeping
                ruby.clear()
            word = clean_node(wxr, None, without_ruby)
            if word != "":
                getattr(word_entry, linkage_type).append(
                    Linkage(word=word, ruby=ruby, sense=sense)
                )
        elif isinstance(child, TemplateNode) and child.template_name == "l":
            l_data = extract_l_template(wxr, child)
            if l_data.word != "":
                getattr(word_entry, linkage_type).append(l_data)
        elif isinstance(child, TemplateNode) and child.template_name == "zh-l":
            getattr(word_entry, linkage_type).extend(
                extract_zh_l_template(wxr, child)
            )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            word = clean_node(wxr, None, child)
            if word != "":
                getattr(word_entry, linkage_type).append(
                    Linkage(word=word, sense=sense)
                )
        elif isinstance(child, TemplateNode) and child.template_name == "sense":
            # a trailing {{sense}} template updates the sense for later items
            sense = clean_node(wxr, None, child).strip("(): ")

    return linkage_type

88 

89 

def extract_descendant_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect descendant/cognate entries from every list in the section."""
    collected = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(process_desc_list_item(wxr, item, []))
    getattr(word_entry, linkage_type).extend(collected)

101 

102 

def process_desc_list_item(
    wxr: WiktextractContext, list_item: WikiNode, parent_list: list[Descendant]
) -> list[Descendant]:
    """Extract Descendant entries from one descendant-list item.

    ``parent_list`` holds the Descendant objects of the enclosing list item;
    every entry found at this level is also attached to each parent as a
    child via ``descendants``. Returns the entries found at this level.
    """
    desc_list = []
    lang_name = "unknown"
    lang_code = "unknown"
    for index, child in enumerate(list_item.children):
        if isinstance(child, str) and ":" in child and lang_name == "unknown":
            # plain "Language: …" prefix; resolve the code from the
            # Japanese language name (only taken once per item)
            lang_name = clean_node(wxr, None, list_item.children[:index])
            lang_code = name_to_code(lang_name, "ja")
        elif isinstance(child, TemplateNode) and child.template_name == "etyl":
            # {{etyl}}: rendered text is the language name, first template
            # argument is the language code
            lang_name = clean_node(wxr, None, child)
            lang_code = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
        elif isinstance(child, TemplateNode) and child.template_name == "l":
            l_data = extract_l_template(wxr, child)
            if l_data.word != "":
                desc_list.append(
                    Descendant(
                        word=l_data.word,
                        lang=lang_name,
                        # fall back to {{l}}'s first argument when no
                        # language code was found earlier in the item
                        lang_code=lang_code
                        or clean_node(
                            wxr, None, child.template_parameters.get(1, "")
                        ),
                        tags=l_data.tags,
                        raw_tags=l_data.raw_tags,
                        roman=l_data.roman,
                        sense=l_data.sense,
                    )
                )
        elif isinstance(child, TemplateNode) and child.template_name == "desc":
            # {{desc}} also yields the language for following zh-l items
            new_descs, lang_code, lang_name = extract_desc_template(wxr, child)
            desc_list.extend(new_descs)
        elif isinstance(child, TemplateNode) and child.template_name == "zh-l":
            for l_data in extract_zh_l_template(wxr, child):
                if l_data.word != "":
                    desc_list.append(
                        Descendant(
                            word=l_data.word,
                            lang=lang_name,
                            lang_code=lang_code,
                            tags=l_data.tags,
                            roman=l_data.roman,
                        )
                    )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # nested list: recurse; children attach themselves under the
            # entries already collected in desc_list
            for next_list_item in child.find_child(NodeKind.LIST_ITEM):
                process_desc_list_item(wxr, next_list_item, desc_list)

    # link this level's entries under every parent descendant
    for p_data in parent_list:
        p_data.descendants.extend(desc_list)
    return desc_list

157 

158 

# カテゴリ:文法テンプレート ("Category: grammar templates")
# Maps gloss-list linkage template names to WordEntry list-field names.
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "ant": "antonyms",
    "hyper": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
    "mero": "meronyms",
    "cot": "coordinate_terms",
}

169 

170 

def extract_gloss_list_linkage_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Handle a linkage template ({{syn}}, {{ant}}, …) inside a gloss list.

    Extracted words are attached to the field named by LINKAGE_TEMPLATES
    and tagged with the gloss of the most recently added sense.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    # sense text is invariant over the loop: join the last sense's glosses
    last_sense = ""
    if len(word_entry.senses) > 0 and len(word_entry.senses[-1].glosses) > 0:
        last_sense = " ".join(word_entry.senses[-1].glosses)
    target_list = getattr(word_entry, LINKAGE_TEMPLATES[t_node.template_name])
    for span_tag in expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        word = clean_node(wxr, None, span_tag)
        if word != "":
            target_list.append(Linkage(word=word, sense=last_sense))

192 

193 

def extract_l_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> Linkage:
    """Parse an {{l}} link template into a Linkage.

    See https://ja.wiktionary.org/wiki/テンプレート:l
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    l_data = Linkage(word="")
    for span_tag in expanded.find_html("span"):
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == lang_code:
            l_data.word = clean_node(wxr, None, span_tag)
        elif lang_attr == lang_code + "-Latn":
            # "<code>-Latn" span carries the romanization
            l_data.roman = clean_node(wxr, None, span_tag)
        elif span_tag.attrs.get("class", "") == "gender":
            gender_text = clean_node(wxr, None, span_tag)
            if gender_text != "":
                l_data.raw_tags.append(gender_text)

    if "lit" in t_node.template_parameters:
        l_data.literal_meaning = clean_node(
            wxr, None, t_node.template_parameters["lit"]
        )
    for sense_arg in (4, "gloss", "t"):
        # if several aliases are present, the last one in this order wins
        if sense_arg in t_node.template_parameters:
            l_data.sense = clean_node(
                wxr, None, t_node.template_parameters[sense_arg]
            )
    translate_raw_tags(l_data)
    return l_data

226 

227 

def extract_alt_form_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract alternative spellings/forms from a section's lists.

    Each wiki link (or {{l}} template) becomes a Form tagged "alternative";
    parenthesised text around a form is gathered into ``tag_nodes`` and
    attached to the most recent form as a raw tag.
    """
    forms = []
    # depth of currently-open parentheses split across several string nodes
    parentheses = 0
    # nodes collected since the last form; flushed into that form's raw_tags
    tag_nodes = []

    def add_tag():
        # attach accumulated parenthesised text to the latest form, then reset
        if len(forms) > 0 and len(tag_nodes) > 0:
            # strip both ASCII and full-width parentheses plus spaces
            raw_tag = clean_node(wxr, None, tag_nodes).strip("()() ")
            if raw_tag != "":
                forms[-1].raw_tags.append(raw_tag)
                translate_raw_tags(forms[-1])
        tag_nodes.clear()

    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for node in list_item.children:
                if (
                    isinstance(node, WikiNode)
                    and node.kind == NodeKind.LINK
                    and parentheses == 0
                ):
                    word = clean_node(wxr, None, node)
                    if word != "":
                        forms.append(Form(form=word, tags=["alternative"]))
                        # flush tags that preceded this link
                        add_tag()
                elif (
                    isinstance(node, TemplateNode) and node.template_name == "l"
                ):
                    l_data = extract_l_template(wxr, node)
                    if l_data.word != "":
                        forms.append(
                            Form(
                                form=l_data.word,
                                tags=l_data.tags + ["alternative"],
                                raw_tags=l_data.raw_tags,
                                roman=l_data.roman,
                                literal_meaning=l_data.literal_meaning,
                            )
                        )
                        add_tag()
                elif (
                    isinstance(node, str)
                    and node.strip().startswith(("(", "("))
                    and node.strip().endswith((")", ")"))
                ):
                    # self-contained "(…)" string; no depth change needed
                    tag_nodes.append(node)
                elif isinstance(node, str) and ("(" in node or "(" in node):
                    # opening half of a parenthesised run
                    parentheses += 1
                    tag_nodes.append(node)
                elif isinstance(node, str) and (")" in node or ")" in node):
                    # closing half
                    parentheses -= 1
                    tag_nodes.append(node)
                elif parentheses > 0:
                    # any node inside open parentheses is part of the tag
                    tag_nodes.append(node)
    # flush whatever trailed the final form
    add_tag()
    word_entry.forms.extend(forms)

286 

287 

def extract_desc_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Descendant], str, str]:
    """Parse a {{desc}} template.

    Returns (descendant list, language code, language name) so the caller
    can reuse the language for sibling nodes in the same list item.
    """
    results = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    lang_name = "unknown"
    for child in expanded.children:
        if isinstance(child, str) and child.strip().endswith(":"):
            # "Language:" text preceding the links
            lang_name = child.strip(": ")
            continue
        is_lang_span = (
            isinstance(child, HTMLNode)
            and child.tag == "span"
            and lang_code == child.attrs.get("lang", "")
        )
        if is_lang_span:
            for link in child.find_child(NodeKind.LINK):
                word = clean_node(wxr, None, link)
                if word != "":
                    results.append(
                        Descendant(
                            lang=lang_name, lang_code=lang_code, word=word
                        )
                    )

    return results, lang_code, lang_name

315 

316 

def extract_zh_l_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Linkage]:
    """Parse a {{zh-l}} template into traditional/simplified Linkage items."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # romanization is rendered in <i>; the last one found is kept
    roman = ""
    for italic in expanded.find_html("i"):
        roman = clean_node(wxr, None, italic)

    results = []
    for index, span_tag in enumerate(
        expanded.find_html("span", attr_name="lang", attr_value="zh")
    ):
        word = clean_node(wxr, None, span_tag)
        if word == "":
            continue
        # first zh span is the traditional form, the second the simplified
        form_tag = (
            "Traditional-Chinese" if index == 0 else "Simplified-Chinese"
        )
        results.append(Linkage(word=word, tags=[form_tag], roman=roman))
    return results