Coverage for src/wiktextract/extractor/ja/linkage.py: 85%

146 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1from mediawiki_langcodes import name_to_code 

2from wikitextprocessor import ( 

3 HTMLNode, 

4 LevelNode, 

5 NodeKind, 

6 TemplateNode, 

7 WikiNode, 

8) 

9 

10from ...page import clean_node 

11from ...wxr_context import WiktextractContext 

12from ..ruby import extract_ruby 

13from .models import Descendant, Form, Linkage, WordEntry 

14from .section_titles import LINKAGES 

15from .tags import translate_raw_tags 

16 

17 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Extract a linkage section (synonyms, antonyms, ...) into *word_entry*.

    Cognate and descendant sections are routed to the dedicated
    descendant extractor instead.
    """
    if linkage_type in ("cognates", "descendants"):
        extract_descendant_section(wxr, word_entry, level_node, linkage_type)
        return

    sense = ""
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode) and child.template_name.startswith(
            "rel-top"
        ):
            # "rel-top*" templates carry the sense text as their first argument
            sense = clean_node(wxr, None, child.template_parameters.get(1, ""))
        elif child.kind == NodeKind.LIST:
            for item in child.find_child_recursively(NodeKind.LIST_ITEM):
                # a list item may redefine the linkage type for later items
                linkage_type = process_linkage_list_item(
                    wxr, word_entry, item, linkage_type, sense
                )

39 

40 

def process_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
) -> str:
    """Parse one linkage list item and append results to *word_entry*.

    Returns the (possibly updated) linkage type so the caller can carry
    it over to the following list items.
    """
    seen_colon = False
    for idx, child in enumerate(list_item.children):
        if isinstance(child, str) and ":" in child and not seen_colon:
            # text before the first colon names the linkage type
            type_text = clean_node(wxr, None, list_item.children[:idx])
            linkage_type = LINKAGES.get(type_text, linkage_type)
            seen_colon = True
        elif isinstance(child, TemplateNode):
            t_name = child.template_name
            if t_name.startswith(("おくりがな", "ふりがな", "xlink")):
                expanded = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(child), expand_all=True
                )
                ruby, no_ruby = extract_ruby(wxr, expanded.children)
                if t_name == "xlink":
                    # no ruby annotations are kept for "xlink"
                    ruby.clear()
                word = clean_node(wxr, None, no_ruby)
                if word != "":
                    getattr(word_entry, linkage_type).append(
                        Linkage(word=word, ruby=ruby, sense=sense)
                    )
            elif t_name == "l":
                l_data = extract_l_template(wxr, child)
                if l_data.word != "":
                    getattr(word_entry, linkage_type).append(l_data)
            elif t_name == "zh-l":
                getattr(word_entry, linkage_type).extend(
                    extract_zh_l_template(wxr, child)
                )
            elif t_name == "sense":
                # sense applies to the nodes that follow it
                sense = clean_node(wxr, None, child).strip("(): ")
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            word = clean_node(wxr, None, child)
            if word != "":
                getattr(word_entry, linkage_type).append(
                    Linkage(word=word, sense=sense)
                )

    return linkage_type

88 

89 

def extract_descendant_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Extract a descendants/cognates section into *word_entry*."""
    results: list[Descendant] = []
    for wiki_list in level_node.find_child(NodeKind.LIST):
        for item in wiki_list.find_child(NodeKind.LIST_ITEM):
            results.extend(process_desc_list_item(wxr, item, []))
    getattr(word_entry, linkage_type).extend(results)

101 

102 

def process_desc_list_item(
    wxr: WiktextractContext, list_item: WikiNode, parent_list: list[Descendant]
) -> list[Descendant]:
    """Parse one descendant list item.

    Returns the descendants found directly in this item; they are also
    appended to the ``descendants`` field of every entry in *parent_list*,
    so nested wiki lists produce a descendant tree.
    """
    desc_list = []
    lang_name = "unknown"
    lang_code = "unknown"
    for index, child in enumerate(list_item.children):
        if isinstance(child, str) and ":" in child and lang_name == "unknown":
            # plain text before the first colon names the language
            lang_name = clean_node(wxr, None, list_item.children[:index])
            lang_code = name_to_code(lang_name, "ja")
        elif isinstance(child, TemplateNode) and child.template_name == "etyl":
            # "etyl" template: rendered text is the language name,
            # first argument is the language code
            lang_name = clean_node(wxr, None, child)
            lang_code = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
        elif isinstance(child, TemplateNode) and child.template_name == "l":
            l_data = extract_l_template(wxr, child)
            if l_data.word != "":
                desc_list.append(
                    Descendant(
                        word=l_data.word,
                        lang=lang_name,
                        # fall back to the template's own first argument
                        # when no language code was found earlier
                        lang_code=lang_code
                        or clean_node(
                            wxr, None, child.template_parameters.get(1, "")
                        ),
                        tags=l_data.tags,
                        raw_tags=l_data.raw_tags,
                        roman=l_data.roman,
                        sense=l_data.sense,
                    )
                )
        elif isinstance(child, TemplateNode) and child.template_name == "desc":
            # "desc" also yields the language for following siblings
            new_descs, lang_code, lang_name = extract_desc_template(wxr, child)
            desc_list.extend(new_descs)
        elif isinstance(child, TemplateNode) and child.template_name == "zh-l":
            for l_data in extract_zh_l_template(wxr, child):
                if l_data.word != "":
                    desc_list.append(
                        Descendant(
                            word=l_data.word,
                            lang=lang_name,
                            lang_code=lang_code,
                            tags=l_data.tags,
                            roman=l_data.roman,
                        )
                    )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # nested list: its items descend from the words found above
            for next_list_item in child.find_child(NodeKind.LIST_ITEM):
                process_desc_list_item(wxr, next_list_item, desc_list)

    # attach this level's results to every parent descendant
    for p_data in parent_list:
        p_data.descendants.extend(desc_list)
    return desc_list

157 

158 

# カテゴリ:文法テンプレート ("Category: grammar templates")
# Maps gloss-level linkage template names to WordEntry list-field names.
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "ant": "antonyms",
    "hyper": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
    "mero": "meronyms",
    "cot": "coordinate_terms",
}

169 

170 

def extract_gloss_list_linkage_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Handle a linkage template ("syn", "ant", ...) placed under a gloss.

    Each word found in the expanded template is appended to the
    WordEntry field named by ``LINKAGE_TEMPLATES``, with the most recent
    gloss reused as the sense.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    # loop-invariant: the sense is the last sense's glosses joined together
    sense = ""
    if len(word_entry.senses) > 0 and len(word_entry.senses[-1].glosses) > 0:
        sense = " ".join(word_entry.senses[-1].glosses)
    for span_tag in expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        word = clean_node(wxr, None, span_tag)
        if word == "":
            continue
        getattr(word_entry, LINKAGE_TEMPLATES[t_node.template_name]).append(
            Linkage(word=word, sense=sense)
        )

192 

193 

def extract_l_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> Linkage:
    """Parse a link template into a Linkage.

    https://ja.wiktionary.org/wiki/テンプレート:l
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    result = Linkage(word="")
    for span_tag in expanded.find_html("span"):
        s_lang = span_tag.attrs.get("lang", "")
        s_class = span_tag.attrs.get("class", "")
        if s_lang == lang_code:
            result.word = clean_node(wxr, None, span_tag)
        elif s_lang == f"{lang_code}-Latn":
            # "-Latn" span carries the romanization
            result.roman = clean_node(wxr, None, span_tag)
        elif s_class == "gender":
            gender_text = clean_node(wxr, None, span_tag)
            if gender_text != "":
                result.raw_tags.append(gender_text)

    if "lit" in t_node.template_parameters:
        result.literal_meaning = clean_node(
            wxr, None, t_node.template_parameters["lit"]
        )
    # later arguments win: positional 4, then "gloss", then "t"
    for gloss_arg in (4, "gloss", "t"):
        if gloss_arg in t_node.template_parameters:
            result.sense = clean_node(
                wxr, None, t_node.template_parameters[gloss_arg]
            )
    translate_raw_tags(result)
    return result

226 

227 

def extract_alt_form_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect alternative forms from links and "l" templates in a section."""
    for child in level_node.find_child_recursively(
        NodeKind.LINK | NodeKind.TEMPLATE
    ):
        if child.kind == NodeKind.LINK:
            form_text = clean_node(wxr, None, child)
            if form_text != "":
                word_entry.forms.append(
                    Form(form=form_text, tags=["alternative"])
                )
        elif isinstance(child, TemplateNode) and child.template_name == "l":
            l_data = extract_l_template(wxr, child)
            if l_data.word == "":
                continue
            word_entry.forms.append(
                Form(
                    form=l_data.word,
                    tags=l_data.tags,
                    raw_tags=l_data.raw_tags,
                    roman=l_data.roman,
                    literal_meaning=l_data.literal_meaning,
                )
            )

250 

251 

def extract_desc_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Descendant], str, str]:
    """Parse a "desc" template.

    Returns the extracted descendants plus the language code and name so
    the caller can reuse them for sibling nodes.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    lang_name = "unknown"
    descendants: list[Descendant] = []
    for child in expanded.children:
        if isinstance(child, str) and child.strip().endswith(":"):
            # leading "Language:" text names the language
            lang_name = child.strip(": ")
        elif (
            isinstance(child, HTMLNode)
            and child.tag == "span"
            and child.attrs.get("lang", "") == lang_code
        ):
            for link in child.find_child(NodeKind.LINK):
                word = clean_node(wxr, None, link)
                if word != "":
                    descendants.append(
                        Descendant(
                            lang=lang_name, lang_code=lang_code, word=word
                        )
                    )

    return descendants, lang_code, lang_name

279 

280 

def extract_zh_l_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Linkage]:
    """Parse a "zh-l" template into traditional/simplified Chinese linkages."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # the <i> element (if any) holds the romanization for all forms
    roman = ""
    for italic in expanded.find_html("i"):
        roman = clean_node(wxr, None, italic)
    linkages = []
    for index, span_tag in enumerate(
        expanded.find_html("span", attr_name="lang", attr_value="zh")
    ):
        word = clean_node(wxr, None, span_tag)
        if word == "":
            continue
        # first zh span is the traditional form, later ones simplified
        tag = "Traditional Chinese" if index == 0 else "Simplified Chinese"
        linkages.append(Linkage(word=word, tags=[tag], roman=roman))
    return linkages