Coverage for src / wiktextract / extractor / ja / conjugation.py: 94%

166 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..ruby import extract_ruby 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def extract_conjugation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Dispatch each conjugation template in a section to its extractor.

    Every direct template child of ``level_node`` is matched by template
    name against three groups of known conjugation templates; templates
    whose name is in none of the groups are skipped.
    """
    # https://ja.wiktionary.org/wiki/テンプレートの一覧/ja
    # Templates handled by extract_ja_conj_template.
    conj_templates = frozenset(
        {
            "日本語形容動詞活用",
            "日本語五段活用",
            "日本語五段活用/表示",
            "日本語上一段活用",
            "日本語上一段活用2",
            "日本語下一段活用",
            "日本語形容詞活用",
            "日本語形容詞活用/表示",
            "日本語形容詞活用2",
            "日本語タルト活用",
            "日本語ダ活用",
            "日本語サ変活用",
            "日本語一段活用",
            "日本語カ変活用",
            "日本語ザ変活用",
            "日本語変格活用",  # has delete request
            "古典日本語四段活用",
            "古典日本語上一段活用",
            "古典日本語上二段活用",
            "古典日本語下一段活用",
            "古典日本語下二段活用",
            "古典日本語変格活用",
        }
    )
    # Templates handled by extract_ja_auxiliary_verb_conj_template.
    auxiliary_verb_templates = frozenset(
        {
            "日本語助動詞活用",
            "古典日本語助動詞活用",
        }
    )
    # Templates handled by extract_classical_ja_conj_template.
    classical_templates = frozenset(
        {
            "古典日本語ク活用",
            "古典日本語シク活用",
            "古典日本語ナリ活用",
            "古典日本語タリ活用",
        }
    )
    # The three groups are disjoint, so at most one handler matches a name.
    dispatch = (
        (conj_templates, extract_ja_conj_template),
        (auxiliary_verb_templates, extract_ja_auxiliary_verb_conj_template),
        (classical_templates, extract_classical_ja_conj_template),
    )
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name
        for known_names, handler in dispatch:
            if name in known_names:
                handler(wxr, word_entry, template)
                break

55 

56 

def extract_ja_conj_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Expand a conjugation template and extract forms from its tables.

    The template is converted back to wikitext and re-parsed with full
    expansion. Direct link children of the expansion are passed to
    ``clean_node`` together with ``word_entry``. Of the tables found
    recursively, the first goes to ``extract_ja_first_conj_table`` and the
    second to ``extract_ja_second_conj_table``; later tables are ignored.
    """
    # extract templates use this Lua module
    # https://ja.wiktionary.org/wiki/モジュール:日本語活用表
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    for link in root.find_child(NodeKind.LINK):
        # NOTE(review): presumably called for its side effect on word_entry
        # (e.g. category collection) - confirm against clean_node.
        clean_node(wxr, word_entry, link)

    # Extractor to use for the table at each position in document order.
    table_extractors = (
        extract_ja_first_conj_table,
        extract_ja_second_conj_table,
    )
    tables = root.find_child_recursively(NodeKind.TABLE)
    for position, table in enumerate(tables):
        if position < len(table_extractors):
            table_extractors[position](wxr, word_entry, table)

74 

75 

def extract_ja_first_conj_table(
    wxr: WiktextractContext, word_entry: WordEntry, table: WikiNode
):
    """Extract forms from the first table of an expanded conjugation template.

    Header cells carrying a ``colspan`` attribute provide the spanning
    header, which is converted to tags by
    ``convert_ja_first_conj_table_header``; other header cells are collected
    as column headers. The first data cell of a row supplies the stem (and
    ruby), and every remaining data cell contributes one form per text line,
    built as stem + line. The tags of the last spanning header are also
    added to ``word_entry.tags``.
    """
    caption = ""
    header_tags = []
    spanning_header = ""
    column_headers = []
    stem = ""
    ruby = []
    child_kinds = NodeKind.TABLE_CAPTION | NodeKind.TABLE_ROW
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL

    for node in table.find_child(child_kinds):
        if node.kind == NodeKind.TABLE_CAPTION:
            caption = clean_node(wxr, None, node.children)
            continue
        if node.kind != NodeKind.TABLE_ROW:
            continue

        for col_index, cell in enumerate(node.find_child(cell_kinds)):
            text = clean_node(wxr, None, cell)

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if "colspan" in cell.attrs:
                    spanning_header = text
                    header_tags = convert_ja_first_conj_table_header(
                        spanning_header
                    )
                else:
                    column_headers.append(text)
                continue

            if col_index == 0:
                # Stem cell: keep the ruby and the text without ruby.
                ruby, plain_nodes = extract_ruby(wxr, cell)
                plain_text = clean_node(wxr, None, plain_nodes).strip("()")
                # "語幹無し" is treated as "no stem": the previous stem
                # value is kept unchanged.
                if plain_text != "語幹無し":
                    stem = plain_text
                continue

            for raw_line in text.splitlines():
                ending = raw_line.strip("()")
                # "無し" cells produce no form.
                if ending == "無し":
                    continue
                form = Form(form=stem + ending, ruby=ruby)
                if caption != "":
                    form.raw_tags.append(caption)
                # Prefer the converted tags; fall back to the raw spanning
                # header text when it could not be converted.
                if len(header_tags) > 0:
                    form.tags.extend(header_tags)
                elif spanning_header != "":
                    form.raw_tags.append(spanning_header)
                if col_index < len(column_headers):
                    form.raw_tags.append(column_headers[col_index])
                if form.form != "":
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

    word_entry.tags.extend(header_tags)

129 

130 

def extract_ja_second_conj_table(
    wxr: WiktextractContext, word_entry: WordEntry, table: WikiNode
):
    """Extract forms from the second table of an expanded conjugation template.

    Each row is read as up to three data cells: column 0 is the row header,
    column 1 holds the forms (one per text line) and column 2, when longer
    than three characters, is a note appended to every form of the row as a
    raw tag, along with a shortened variant of it. The forms of each row are
    appended to ``word_entry.forms``.
    """
    caption = ""
    child_kinds = NodeKind.TABLE_CAPTION | NodeKind.TABLE_ROW

    for node in table.find_child(child_kinds):
        if node.kind == NodeKind.TABLE_CAPTION:
            caption = clean_node(wxr, None, node.children)
            continue
        if node.kind != NodeKind.TABLE_ROW:
            continue

        row_label = ""
        row_forms = []
        for col_index, cell in enumerate(node.find_child(NodeKind.TABLE_CELL)):
            ruby, plain_nodes = extract_ruby(wxr, cell)
            text = clean_node(wxr, None, plain_nodes)

            if col_index == 0:
                row_label = text
            elif col_index == 1:
                for line in text.splitlines():
                    form = Form(form=line, ruby=ruby)
                    if caption != "":
                        form.raw_tags.append(caption)
                    if row_label != "":
                        form.raw_tags.append(row_label)
                    if form.form != "":
                        row_forms.append(form)
            elif col_index == 2 and len(text) > 3:
                # Shortened variant of the note: drop a trailing "のみ",
                # and when a "+" is present keep only the part before it,
                # stripped and without a trailing "音便".
                short_tag = text.removesuffix("のみ")
                if "+" in short_tag:
                    before_plus = short_tag.partition("+")[0]
                    short_tag = before_plus.strip().removesuffix("音便")

                for form in row_forms:
                    form.raw_tags.append(text)
                    if short_tag != "":
                        form.raw_tags.append(short_tag)
                    # Tags are only translated for rows that reach this
                    # branch.
                    translate_raw_tags(form)

        word_entry.forms.extend(row_forms)

173 

174 

def convert_ja_first_conj_table_header(header: str) -> list[str]:
    """Convert a "<row>行<type>活用" table header into a list of tags.

    Args:
        header: Header text, e.g. "カ行五段活用".

    Returns:
        A new list holding the "<romaji>-row" tag for a known katakana row
        followed by the tags of a known conjugation type. Unknown rows or
        types contribute nothing; a header that does not match the
        "…行…活用" pattern yields an empty list.
    """
    # https://en.wikipedia.org/wiki/Japanese_conjugation
    m = re.fullmatch(r"(.+?)行(.+?)活用", header)
    if m is None:
        return []
    katakana_map = {
        "ア": "a",
        "カ": "ka",
        "ガ": "ga",
        "サ": "sa",
        "ザ": "za",
        "タ": "ta",
        "ダ": "da",
        "ナ": "na",
        "ハ": "ha",
        "バ": "ba",
        "マ": "ma",
        # The ya row occurs in headers such as "ヤ行上二段活用" and
        # "ヤ行下二段活用"; without it those headers lose their row tag.
        "ヤ": "ya",
        "ラ": "ra",
        "ワ": "wa",
    }
    verb_tags = {
        "上一段": ["kamiichidan", "ichidan"],
        "下一段": ["shimoichidan", "ichidan"],
        "上二段": ["kaminidan", "nidan"],
        "下二段": ["shimonidan", "nidan"],
        "四段": ["yodan"],
        "五段": ["godan"],
        "変格": ["irregular"],
    }
    katakana, verb_type = m.groups()
    tags = []
    if katakana in katakana_map:
        tags.append(f"{katakana_map[katakana]}-row")
    tags.extend(verb_tags.get(verb_type, []))
    return tags

210 

211 

def extract_ja_auxiliary_verb_conj_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract forms from an expanded auxiliary-verb conjugation template.

    Header cells of the direct table children are collected as column
    headers. The data cell at column index 6 is kept as a raw tag that is
    added to every extracted form; each other data cell yields one form per
    non-empty text line, tagged with the column header at the same index
    when one exists. All forms are appended to ``word_entry.forms``.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    collected = []
    headers = []
    shared_tag = ""
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL

    for table in root.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            for col_index, cell in enumerate(row.find_child(cell_kinds)):
                text = clean_node(wxr, None, cell)
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    headers.append(text)
                    continue
                if col_index == 6:
                    shared_tag = text
                    continue
                for line in text.splitlines():
                    # Parentheses and the "○" placeholder are stripped from
                    # both ends; lines left empty produce no form.
                    word = line.strip("()○")
                    if word == "":
                        continue
                    form = Form(form=word)
                    if col_index < len(headers):
                        form.raw_tags.append(headers[col_index])
                    collected.append(form)

    # The shared tag is applied only after all tables are read, so it also
    # reaches forms that were extracted before its cell was seen.
    for form in collected:
        if shared_tag != "":
            form.raw_tags.append(shared_tag)
        translate_raw_tags(form)
    word_entry.forms.extend(collected)

244 

245 

def extract_classical_ja_conj_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract forms from an expanded classical Japanese conjugation template.

    Header cells of the direct table children are collected as column
    headers. In the row at index 1, column 1 supplies the stem, column 8 a
    raw tag added to every form, and column 0 is skipped. Every other data
    cell yields one form per non-empty text line, built as stem + line and
    tagged with its column header. Direct link children of the expansion are
    passed to ``clean_node`` together with ``word_entry``, and all forms are
    appended to ``word_entry.forms``.
    """
    forms = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    col_headers = []
    stem = ""
    raw_tag = ""
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            for col_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    col_headers.append(cell_text)
                elif row_index == 1 and col_index == 1:
                    stem = cell_text
                elif row_index == 1 and col_index == 8:
                    raw_tag = cell_text
                elif not (row_index == 1 and col_index == 0):
                    # Cells in the row at index 2 are shifted two columns
                    # relative to the headers (presumably because the two
                    # leading cells of the previous row span into it -
                    # confirm against the template output). The shift is
                    # computed once per cell: doing it inside the line loop
                    # compounded it for every extra line of a multi-line
                    # cell and picked the wrong header.
                    header_index = col_index + 2 if row_index == 2 else col_index
                    for line in cell_text.splitlines():
                        line = line.strip("()○-")
                        if line != "":
                            form = Form(form=stem + line)
                            if header_index < len(col_headers):
                                form.raw_tags.append(col_headers[header_index])
                            if form.form != "":
                                forms.append(form)
    # The shared raw tag is applied after all tables are read so it also
    # reaches forms extracted before its cell was seen.
    for form in forms:
        if raw_tag != "":
            form.raw_tags.append(raw_tag)
        translate_raw_tags(form)
    for link in expanded_node.find_child(NodeKind.LINK):
        # NOTE(review): presumably called for its side effect on word_entry
        # (e.g. category collection) - confirm against clean_node.
        clean_node(wxr, word_entry, link)
    word_entry.forms.extend(forms)