Coverage for src / wiktextract / extractor / ja / conjugation.py: 94%
166 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..ruby import extract_ruby
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
# Template names routed to extract_ja_conj_template().
# https://ja.wiktionary.org/wiki/テンプレートの一覧/ja
_JA_CONJ_TEMPLATE_NAMES = frozenset(
    {
        "日本語形容動詞活用",
        "日本語五段活用",
        "日本語五段活用/表示",
        "日本語上一段活用",
        "日本語上一段活用2",
        "日本語下一段活用",
        "日本語形容詞活用",
        "日本語形容詞活用/表示",
        "日本語形容詞活用2",
        "日本語タルト活用",
        "日本語ダ活用",
        "日本語サ変活用",
        "日本語一段活用",
        "日本語カ変活用",
        "日本語ザ変活用",
        "日本語変格活用",  # has delete request
        "古典日本語四段活用",
        "古典日本語上一段活用",
        "古典日本語上二段活用",
        "古典日本語下一段活用",
        "古典日本語下二段活用",
        "古典日本語変格活用",
    }
)

# Template names routed to extract_ja_auxiliary_verb_conj_template().
_JA_AUXILIARY_VERB_CONJ_TEMPLATE_NAMES = frozenset(
    {
        "日本語助動詞活用",
        "古典日本語助動詞活用",
    }
)

# Template names routed to extract_classical_ja_conj_template().
_CLASSICAL_JA_CONJ_TEMPLATE_NAMES = frozenset(
    {
        "古典日本語ク活用",
        "古典日本語シク活用",
        "古典日本語ナリ活用",
        "古典日本語タリ活用",
    }
)


def extract_conjugation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Dispatch each conjugation template in a section to its extractor.

    Only the direct template children of ``level_node`` are inspected.
    Templates whose name is in none of the three name sets are ignored.

    Args:
        wxr: Extraction context, passed through to the extractor.
        word_entry: Entry passed through to the extractor, which adds the
            extracted forms to it.
        level_node: Section node whose template children are examined.
    """
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        if name in _JA_CONJ_TEMPLATE_NAMES:
            handler = extract_ja_conj_template
        elif name in _JA_AUXILIARY_VERB_CONJ_TEMPLATE_NAMES:
            handler = extract_ja_auxiliary_verb_conj_template
        elif name in _CLASSICAL_JA_CONJ_TEMPLATE_NAMES:
            handler = extract_classical_ja_conj_template
        else:
            continue
        handler(wxr, word_entry, t_node)
def extract_ja_conj_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Expand a conjugation template and extract its first two tables.

    The template is converted back to wikitext and re-parsed with full
    expansion. The first table found anywhere in the expanded tree goes to
    extract_ja_first_conj_table(), the second to
    extract_ja_second_conj_table(); any further tables are ignored.
    """
    # Templates handled here are rendered by this Lua module:
    # https://ja.wiktionary.org/wiki/モジュール:日本語活用表
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)

    # The cleaned text is discarded; presumably clean_node() is called for
    # what it records on word_entry — confirm against clean_node().
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)

    # Position of the table in document order selects its extractor.
    table_handlers = (
        extract_ja_first_conj_table,
        extract_ja_second_conj_table,
    )
    for position, table in enumerate(
        expanded.find_child_recursively(NodeKind.TABLE)
    ):
        if position < len(table_handlers):
            table_handlers[position](wxr, word_entry, table)
def extract_ja_first_conj_table(
    wxr: WiktextractContext, word_entry: WordEntry, table: WikiNode
):
    """Extract forms from the first table of an expanded conjugation template.

    State is carried across rows: the caption, the most recent ``colspan``
    header (and the tags derived from it), the list of plain column headers,
    and the stem with its ruby taken from the first data cell of a row.
    Every non-first data cell yields one form per text line, built as
    ``stem + line``. After the table is processed, the tags derived from the
    last ``colspan`` header are also added to ``word_entry.tags``.
    """
    caption = ""
    header_tags = []
    header_text = ""
    column_names = []
    stem = ""
    ruby = []
    child_kinds = NodeKind.TABLE_CAPTION | NodeKind.TABLE_ROW
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL

    for child in table.find_child(child_kinds):
        if child.kind == NodeKind.TABLE_CAPTION:
            caption = clean_node(wxr, None, child.children)
            continue
        if child.kind != NodeKind.TABLE_ROW:
            continue

        for col_index, cell in enumerate(child.find_child(cell_kinds)):
            text = clean_node(wxr, None, cell)

            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                if "colspan" in cell.attrs:
                    # Spanning header: try to turn it into tags; the raw
                    # text is kept as a fallback for when that yields none.
                    header_text = text
                    header_tags = convert_ja_first_conj_table_header(text)
                else:
                    column_names.append(text)
                continue

            if col_index == 0:
                # First data cell holds the stem; "語幹無し" leaves the
                # previously seen stem unchanged.
                ruby, plain_nodes = extract_ruby(wxr, cell)
                plain_text = clean_node(wxr, None, plain_nodes).strip("()")
                if plain_text != "語幹無し":
                    stem = plain_text
                continue

            for raw_line in text.splitlines():
                ending = raw_line.strip("()")
                if ending == "無し":
                    continue
                form = Form(form=stem + ending, ruby=ruby)
                if caption != "":
                    form.raw_tags.append(caption)
                if len(header_tags) > 0:
                    form.tags.extend(header_tags)
                elif header_text != "":
                    form.raw_tags.append(header_text)
                if col_index < len(column_names):
                    form.raw_tags.append(column_names[col_index])
                if form.form != "":
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

    word_entry.tags.extend(header_tags)
def extract_ja_second_conj_table(
    wxr: WiktextractContext, word_entry: WordEntry, table: WikiNode
):
    """Extract forms from the second table of an expanded conjugation template.

    Each row is read by data-cell position:

    * cell 0: row header, added to every form of the row as a raw tag;
    * cell 1: the forms, one per text line, with the cell's ruby;
    * cell 2: a note, used only when longer than three characters. The full
      note is added as a raw tag, followed by a shortened variant: the
      "のみ" suffix is dropped and, when the note contains "+", only the
      text before it is kept with a trailing "音便" removed.

    The table caption, once seen, is added as a raw tag to the forms of the
    rows that follow it. All forms are appended to ``word_entry.forms``.
    """
    table_caption = ""
    for row_or_caption in table.find_child(
        NodeKind.TABLE_CAPTION | NodeKind.TABLE_ROW
    ):
        if row_or_caption.kind == NodeKind.TABLE_CAPTION:
            table_caption = clean_node(wxr, None, row_or_caption.children)
            continue
        if row_or_caption.kind != NodeKind.TABLE_ROW:
            continue

        row_header = ""
        forms = []
        for col_index, cell_node in enumerate(
            row_or_caption.find_child(NodeKind.TABLE_CELL)
        ):
            ruby, no_ruby_nodes = extract_ruby(wxr, cell_node)
            cell_text = clean_node(wxr, None, no_ruby_nodes)
            if col_index == 0:
                row_header = cell_text
            elif col_index == 1:
                for line in cell_text.splitlines():
                    form = Form(form=line, ruby=ruby)
                    if table_caption != "":
                        form.raw_tags.append(table_caption)
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    if form.form != "":
                        forms.append(form)
            elif col_index == 2 and len(cell_text) > 3:
                # The shortened note depends only on the cell, so compute
                # it once instead of once per form.
                short_note = cell_text.removesuffix("のみ")
                if "+" in short_note:
                    short_note = (
                        short_note[: short_note.index("+")]
                        .strip()
                        .removesuffix("音便")
                    )
                for form in forms:
                    form.raw_tags.append(cell_text)
                    if short_note != "":
                        form.raw_tags.append(short_note)

        # Translate once per form after the whole row is read, so rows
        # whose note cell is missing or too short are translated as well.
        for form in forms:
            translate_raw_tags(form)
        word_entry.forms.extend(forms)
# Katakana row name -> romaji used to build the "<romaji>-row" tag.
_JA_KANA_ROW_ROMAJI = {
    "ア": "a",
    "カ": "ka",
    "ガ": "ga",
    "サ": "sa",
    "ザ": "za",
    "タ": "ta",
    "ダ": "da",
    "ナ": "na",
    "ハ": "ha",
    "バ": "ba",
    "マ": "ma",
    "ヤ": "ya",
    "ラ": "ra",
    "ワ": "wa",
}

# Conjugation class name -> tags.
_JA_CONJ_CLASS_TAGS = {
    "上一段": ["kamiichidan", "ichidan"],
    "下一段": ["shimoichidan", "ichidan"],
    "上二段": ["kaminidan", "nidan"],
    "下二段": ["shimonidan", "nidan"],
    "四段": ["yodan"],
    "五段": ["godan"],
    "変格": ["irregular"],
}

# "<row>行<class>活用"; both groups are non-greedy.
_JA_CONJ_HEADER_RE = re.compile(r"(.+?)行(.+?)活用")


def convert_ja_first_conj_table_header(header: str) -> list[str]:
    """Convert a "<row>行<class>活用" table header into tags.

    https://en.wikipedia.org/wiki/Japanese_conjugation

    Args:
        header: Header text, e.g. "カ行五段活用".

    Returns:
        A new list containing "<romaji>-row" when the row is known, followed
        by the tags of the conjugation class when it is known. Empty when
        the header does not match the pattern at all.
    """
    m = _JA_CONJ_HEADER_RE.fullmatch(header)
    if m is None:
        return []
    row_kana, conj_class = m.groups()
    tags = []
    if row_kana in _JA_KANA_ROW_ROMAJI:
        tags.append(f"{_JA_KANA_ROW_ROMAJI[row_kana]}-row")
    # extend() copies the items, so the shared lookup lists are never
    # handed out to callers.
    tags.extend(_JA_CONJ_CLASS_TAGS.get(conj_class, []))
    return tags
def extract_ja_auxiliary_verb_conj_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract forms from an expanded auxiliary-verb conjugation template.

    Only tables that are direct children of the expanded template are read.
    Header cells are collected as column headers. The text of a data cell at
    position 6 is remembered and later added to every form as a raw tag. Any
    other data cell yields one form per non-empty line, after stripping
    "(", ")" and "○" from both ends.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    headers = []
    shared_tag = ""
    collected = []

    for table in expanded.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            for col_index, cell in enumerate(row.find_child(cell_kinds)):
                text = clean_node(wxr, None, cell)
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    headers.append(text)
                    continue
                if col_index == 6:
                    # NOTE(review): presumably the conjugation-type column;
                    # confirm against the rendered template.
                    shared_tag = text
                    continue
                for raw_line in text.splitlines():
                    word = raw_line.strip("()○")
                    if word == "":
                        continue
                    form = Form(form=word)
                    if col_index < len(headers):
                        form.raw_tags.append(headers[col_index])
                    collected.append(form)

    # The shared tag may be set after some forms were created, so it is
    # applied in a second pass, followed by tag translation.
    for form in collected:
        if shared_tag != "":
            form.raw_tags.append(shared_tag)
        translate_raw_tags(form)
    word_entry.forms.extend(collected)
def extract_classical_ja_conj_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract forms from an expanded classical Japanese conjugation template.

    Only tables that are direct children of the expanded template are read.
    Header cells are collected as column headers. In row 1, cell 0 is
    skipped, cell 1 supplies the stem and cell 8 supplies a raw tag that is
    later added to every form. Every other data cell yields one form per
    non-empty line, built as ``stem + line`` after stripping "(", ")", "○"
    and "-" from both ends of the line.
    """
    forms = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    col_headers = []
    stem = ""
    raw_tag = ""
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            for col_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    col_headers.append(cell_text)
                elif row_index == 1 and col_index == 1:
                    stem = cell_text
                elif row_index == 1 and col_index == 8:
                    raw_tag = cell_text
                elif not (row_index == 1 and col_index == 0):
                    # Row 2 cells are matched against headers two columns
                    # to the right (presumably its first two columns are
                    # covered by row 1 cells — confirm). Compute the index
                    # once per cell so a multi-line cell does not shift it
                    # again for every line.
                    header_index = col_index + 2 if row_index == 2 else col_index
                    for line in cell_text.splitlines():
                        line = line.strip("()○-")
                        if line != "":
                            form = Form(form=stem + line)
                            if header_index < len(col_headers):
                                form.raw_tags.append(col_headers[header_index])
                            if form.form != "":
                                forms.append(form)

    # raw_tag may be read after some forms were created, so it is applied
    # in a second pass, followed by tag translation.
    for form in forms:
        if raw_tag != "":
            form.raw_tags.append(raw_tag)
        translate_raw_tags(form)
    # The cleaned text is discarded; presumably clean_node() is called for
    # what it records on word_entry — confirm against clean_node().
    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)
    word_entry.forms.extend(forms)