Coverage for src / wiktextract / extractor / ja / linkage.py: 90%

172 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-21 08:01 +0000

1from mediawiki_langcodes import name_to_code 

2from wikitextprocessor import ( 

3 HTMLNode, 

4 LevelNode, 

5 NodeKind, 

6 TemplateNode, 

7 WikiNode, 

8) 

9 

10from ...page import clean_node 

11from ...wxr_context import WiktextractContext 

12from ..ruby import extract_ruby 

13from .models import Descendant, Form, Linkage, WordEntry 

14from .section_titles import LINKAGES 

15from .tags import translate_raw_tags 

16 

17 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Extract one linkage section (synonyms, antonyms, …) into word_entry.

    Sections typed "cognates" or "descendants" are routed to the dedicated
    descendant extractor instead of the generic linkage handling.
    """
    if linkage_type in ("cognates", "descendants"):
        extract_descendant_section(wxr, word_entry, level_node, linkage_type)
        return

    current_sense = ""
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        is_rel_top = isinstance(
            child, TemplateNode
        ) and child.template_name.startswith("rel-top")
        if is_rel_top:
            # a "rel-top" template's first argument is the sense that the
            # following list(s) belong to
            current_sense = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
        elif child.kind == NodeKind.LIST:
            for item in child.find_child_recursively(NodeKind.LIST_ITEM):
                # a list item may rename the linkage type for later items
                linkage_type = process_linkage_list_item(
                    wxr, word_entry, item, linkage_type, current_sense
                )

39 

40 

def process_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
) -> str:
    """Process a single linkage list item and append results to word_entry.

    Returns the (possibly updated) linkage type so the caller can carry a
    "類義語:"-style prefix over to the items that follow.
    """
    seen_colon = False
    for idx, child in enumerate(list_item.children):
        if isinstance(child, str) and ":" in child and not seen_colon:
            # text before the first colon names the linkage type
            type_text = clean_node(wxr, None, list_item.children[:idx])
            linkage_type = LINKAGES.get(type_text, linkage_type)
            seen_colon = True
        elif isinstance(child, TemplateNode) and child.template_name.startswith(
            ("おくりがな", "ふりがな", "xlink")
        ):
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            ruby, without_ruby = extract_ruby(wxr, expanded.children)
            if child.template_name == "xlink":
                # "xlink" output carries no ruby worth keeping
                ruby.clear()
            word = clean_node(wxr, None, without_ruby)
            if word != "":
                getattr(word_entry, linkage_type).append(
                    Linkage(word=word, ruby=ruby, sense=sense)
                )
        elif isinstance(child, TemplateNode) and child.template_name == "l":
            l_data = extract_l_template(wxr, child)
            if l_data.word != "":
                getattr(word_entry, linkage_type).append(l_data)
        elif isinstance(child, TemplateNode) and child.template_name == "zh-l":
            getattr(word_entry, linkage_type).extend(
                extract_zh_l_template(wxr, child)
            )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            word = clean_node(wxr, None, child)
            if word != "":
                getattr(word_entry, linkage_type).append(
                    Linkage(word=word, sense=sense)
                )
        elif isinstance(child, TemplateNode) and child.template_name == "sense":
            # a trailing {{sense}} template updates the sense for later items
            sense = clean_node(wxr, None, child).strip("(): ")

    return linkage_type

88 

89 

def extract_descendant_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect descendant/cognate entries from every list in the section."""
    collected = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(process_desc_list_item(wxr, item, []))
    getattr(word_entry, linkage_type).extend(collected)

101 

102 

def process_desc_list_item(
    wxr: WiktextractContext, list_item: WikiNode, parent_list: list[Descendant]
) -> list[Descendant]:
    """Extract Descendant entries from one descendant-list item.

    ``parent_list`` holds the Descendant objects of the enclosing list item;
    every entry found at this level is also attached to each parent as a
    child via ``descendants``. Returns the entries found at this level.
    """
    desc_list = []
    lang_name = "unknown"
    lang_code = "unknown"
    for index, child in enumerate(list_item.children):
        if isinstance(child, str) and ":" in child and lang_name == "unknown":
            # plain "Language: …" prefix; resolve the code from the
            # Japanese language name (only taken once per item)
            lang_name = clean_node(wxr, None, list_item.children[:index])
            lang_code = name_to_code(lang_name, "ja")
        elif isinstance(child, TemplateNode) and child.template_name == "etyl":
            # {{etyl}}: rendered text is the language name, first template
            # argument is the language code
            lang_name = clean_node(wxr, None, child)
            lang_code = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
        elif isinstance(child, TemplateNode) and child.template_name == "l":
            l_data = extract_l_template(wxr, child)
            if l_data.word != "":
                desc_list.append(
                    Descendant(
                        word=l_data.word,
                        lang=lang_name,
                        # fall back to {{l}}'s first argument when no
                        # language code was found earlier in the item
                        lang_code=lang_code
                        or clean_node(
                            wxr, None, child.template_parameters.get(1, "")
                        ),
                        tags=l_data.tags,
                        raw_tags=l_data.raw_tags,
                        roman=l_data.roman,
                        sense=l_data.sense,
                    )
                )
        elif isinstance(child, TemplateNode) and child.template_name == "desc":
            # {{desc}} also yields the language for following zh-l items
            new_descs, lang_code, lang_name = extract_desc_template(wxr, child)
            desc_list.extend(new_descs)
        elif isinstance(child, TemplateNode) and child.template_name == "zh-l":
            for l_data in extract_zh_l_template(wxr, child):
                if l_data.word != "":
                    desc_list.append(
                        Descendant(
                            word=l_data.word,
                            lang=lang_name,
                            lang_code=lang_code,
                            tags=l_data.tags,
                            roman=l_data.roman,
                        )
                    )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # nested list: recurse; children attach themselves under the
            # entries already collected in desc_list
            for next_list_item in child.find_child(NodeKind.LIST_ITEM):
                process_desc_list_item(wxr, next_list_item, desc_list)

    # link this level's entries under every parent descendant
    for p_data in parent_list:
        p_data.descendants.extend(desc_list)
    return desc_list

157 

158 

# カテゴリ:文法テンプレート ("Category: grammar templates")
# Maps gloss-list linkage template names to WordEntry list-field names.
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "ant": "antonyms",
    "hyper": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
    "mero": "meronyms",
    "cot": "coordinate_terms",
}

169 

170 

def extract_gloss_list_linkage_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Handle a linkage template ({{syn}}, {{ant}}, …) inside a gloss list.

    Extracted words are attached to the field named by LINKAGE_TEMPLATES
    and tagged with the gloss of the most recently added sense.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    # sense text is invariant over the loop: join the last sense's glosses
    last_sense = ""
    if len(word_entry.senses) > 0 and len(word_entry.senses[-1].glosses) > 0:
        last_sense = " ".join(word_entry.senses[-1].glosses)
    target_list = getattr(word_entry, LINKAGE_TEMPLATES[t_node.template_name])
    for span_tag in expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        word = clean_node(wxr, None, span_tag)
        if word != "":
            target_list.append(Linkage(word=word, sense=last_sense))

192 

193 

def extract_l_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> Linkage:
    """Parse an {{l}} link template into a Linkage.

    See https://ja.wiktionary.org/wiki/テンプレート:l
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    l_data = Linkage(word="")
    for span_tag in expanded.find_html("span"):
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == lang_code:
            l_data.word = clean_node(wxr, None, span_tag)
        elif lang_attr == lang_code + "-Latn":
            # "<code>-Latn" span carries the romanization
            l_data.roman = clean_node(wxr, None, span_tag)
        elif span_tag.attrs.get("class", "") == "gender":
            gender_text = clean_node(wxr, None, span_tag)
            if gender_text != "":
                l_data.raw_tags.append(gender_text)

    if "lit" in t_node.template_parameters:
        l_data.literal_meaning = clean_node(
            wxr, None, t_node.template_parameters["lit"]
        )
    for sense_arg in (4, "gloss", "t"):
        # if several aliases are present, the last one in this order wins
        if sense_arg in t_node.template_parameters:
            l_data.sense = clean_node(
                wxr, None, t_node.template_parameters[sense_arg]
            )
    translate_raw_tags(l_data)
    return l_data

226 

227 

def extract_alt_form_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract alternative spellings/forms from a section's lists.

    Each wiki link (or {{l}} template) becomes a Form tagged "alternative";
    parenthesised text around a form is gathered into ``tag_nodes`` and
    attached to the most recent form as a raw tag.
    """
    forms = []
    # depth of currently-open parentheses split across several string nodes
    parentheses = 0
    # nodes collected since the last form; flushed into that form's raw_tags
    tag_nodes = []

    def add_tag():
        # attach accumulated parenthesised text to the latest form, then reset
        if len(forms) > 0 and len(tag_nodes) > 0:
            # strip both ASCII and full-width parentheses plus spaces
            raw_tag = clean_node(wxr, None, tag_nodes).strip("()() ")
            if raw_tag != "":
                forms[-1].raw_tags.append(raw_tag)
                translate_raw_tags(forms[-1])
        tag_nodes.clear()

    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for node in list_item.children:
                if (
                    isinstance(node, WikiNode)
                    and node.kind == NodeKind.LINK
                    and parentheses == 0
                ):
                    word = clean_node(wxr, None, node)
                    if word != "":
                        forms.append(Form(form=word, tags=["alternative"]))
                        # flush tags that preceded this link
                        add_tag()
                elif (
                    isinstance(node, TemplateNode) and node.template_name == "l"
                ):
                    l_data = extract_l_template(wxr, node)
                    if l_data.word != "":
                        forms.append(
                            Form(
                                form=l_data.word,
                                tags=l_data.tags + ["alternative"],
                                raw_tags=l_data.raw_tags,
                                roman=l_data.roman,
                                literal_meaning=l_data.literal_meaning,
                            )
                        )
                        add_tag()
                elif (
                    isinstance(node, str)
                    and node.strip().startswith(("(", "("))
                    and node.strip().endswith((")", ")"))
                ):
                    # self-contained "(…)" string; no depth change needed
                    tag_nodes.append(node)
                elif isinstance(node, str) and ("(" in node or "(" in node):
                    # opening half of a parenthesised run
                    parentheses += 1
                    tag_nodes.append(node)
                elif isinstance(node, str) and (")" in node or ")" in node):
                    # closing half
                    parentheses -= 1
                    tag_nodes.append(node)
                elif parentheses > 0:
                    # any node inside open parentheses is part of the tag
                    tag_nodes.append(node)
    # flush whatever trailed the final form
    add_tag()
    word_entry.forms.extend(forms)

286 

287 

def extract_desc_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Descendant], str, str]:
    """Parse a {{desc}} template.

    Returns (descendant list, language code, language name) so the caller
    can reuse the language for sibling nodes in the same list item.
    """
    results = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    lang_name = "unknown"
    for child in expanded.children:
        if isinstance(child, str) and child.strip().endswith(":"):
            # "Language:" text preceding the links
            lang_name = child.strip(": ")
            continue
        is_lang_span = (
            isinstance(child, HTMLNode)
            and child.tag == "span"
            and lang_code == child.attrs.get("lang", "")
        )
        if is_lang_span:
            for link in child.find_child(NodeKind.LINK):
                word = clean_node(wxr, None, link)
                if word != "":
                    results.append(
                        Descendant(
                            lang=lang_name, lang_code=lang_code, word=word
                        )
                    )

    return results, lang_code, lang_name

315 

316 

def extract_zh_l_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Linkage]:
    """Parse a {{zh-l}} template into traditional/simplified Linkage items."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # romanization is rendered in <i>; the last one found is kept
    roman = ""
    for italic in expanded.find_html("i"):
        roman = clean_node(wxr, None, italic)

    results = []
    for index, span_tag in enumerate(
        expanded.find_html("span", attr_name="lang", attr_value="zh")
    ):
        word = clean_node(wxr, None, span_tag)
        if word == "":
            continue
        # first zh span is the traditional form, the second the simplified
        form_tag = (
            "Traditional-Chinese" if index == 0 else "Simplified-Chinese"
        )
        results.append(Linkage(word=word, tags=[form_tag], roman=roman))
    return results