Coverage for src/wiktextract/extractor/zh/inflection.py: 92%
77 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-11 10:26 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-11 10:26 +0000
1from itertools import zip_longest
3from wikitextprocessor import NodeKind, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Form, WordEntry
8from .tags import translate_raw_tags
# https://zh.wiktionary.org/wiki/Category:日語變格表模板
# Template-name prefixes handled by extract_inflections(). The names are
# compared after lower-casing with str.startswith(), which needs a tuple.
JAPANESE_INFLECTION_TEMPLATE_PREFIXES = (
    "ja-i",
    "ja-adj-infl",
    "ja-conj-bungo",
    "ja-go",
    "ja-honorific",
    "ja-ichi",
    "ja-kuru",
    "ja-suru",
    "ja-verbconj",
    "ja-zuru",
)
def extract_inflections(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
) -> None:
    """Extract forms from the Japanese inflection templates of a section.

    Every direct template child of ``level_node`` whose lower-cased name
    starts with one of ``JAPANESE_INFLECTION_TEMPLATE_PREFIXES`` is turned
    back into wikitext, re-parsed with all templates expanded, and each
    table found anywhere in the expansion is passed to
    ``extract_ja_inf_table``.

    Args:
        wxr: Context whose ``wtp`` processor serializes and expands the
            template.
        page_data: Word entries of the page; handed through unchanged to
            ``extract_ja_inf_table``.
        level_node: Section node whose template children are inspected.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name.lower()
        if not name.startswith(JAPANESE_INFLECTION_TEMPLATE_PREFIXES):
            continue
        wikitext = wxr.wtp.node_to_wikitext(template)
        expanded = wxr.wtp.parse(wikitext, expand_all=True)
        for table in expanded.find_child_recursively(NodeKind.TABLE):
            extract_ja_inf_table(wxr, page_data, table)
def extract_ja_inf_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    table_node: WikiNode,
) -> None:
    """Append the forms found in one inflection table to the last entry.

    Rows with exactly one non-empty child are treated specially: if they
    contain ``<small>`` elements, each one whose text starts with "¹" or
    "²" is stored as a footnote keyed by that marker; otherwise the row
    text becomes the table header used as a raw tag for later rows.

    In every other row, header cells and bold nodes inside data cells
    supply raw tags. For each data cell that contains a ``<span>``, the
    cleaned text of the whole cell is split into lines and the class of
    the first span decides where the lines go: "Latn" lines are
    romanizations, "Jpan" lines are forms (first such cell) or hiragana
    (second such cell). The lists are then combined position by position
    into ``Form`` objects appended to ``page_data[-1].forms``.

    After all rows are read, footnote markers left in the raw tags of the
    entry's "inflection table" forms are replaced by the footnote text and
    the tags are translated again.

    Args:
        wxr: Context used for node cleaning and for the page title.
        page_data: Word entries of the page; only the last one is changed.
        table_node: Table node of the expanded inflection template.
    """
    footnote_marks = ("¹", "²")
    header_text = ""
    footnotes = {}
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        if len(list(row.filter_empty_str_child())) == 1:
            # Single-cell row: either the footnotes that close the table
            # or a header line.
            found_small = False
            for small_node in row.find_html_recursively("small"):
                found_small = True
                note = clean_node(wxr, None, small_node)
                if note.startswith(footnote_marks):
                    footnotes[note[0]] = note[1:].strip()
            if not found_small:
                header_text = clean_node(wxr, None, row.children)
            continue

        forms = []
        hiraganas = []
        romans = []
        row_tags = []
        form_marks = []
        data_cell_count = 0
        for cell in row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                stripped = (
                    line.strip("() ")
                    for line in clean_node(wxr, None, cell).splitlines()
                )
                row_tags.extend(text for text in stripped if text)
            elif cell.kind == NodeKind.TABLE_CELL:
                if data_cell_count >= 3:
                    break
                # Bold text inside a data cell is a row header.
                row_tags.extend(
                    clean_node(wxr, None, bold)
                    for bold in cell.find_child(NodeKind.BOLD)
                )
                # Only the first span is inspected; cells without a span
                # are not counted.
                first_span = next(iter(cell.find_html("span")), None)
                if first_span is None:
                    continue
                cell_text = clean_node(wxr, None, cell)
                css_class = first_span.attrs.get("class", "")
                if css_class == "Latn":
                    target = romans
                elif css_class == "Jpan" and data_cell_count == 0:
                    target = forms
                elif css_class == "Jpan" and data_cell_count == 1:
                    target = hiraganas
                else:
                    target = None
                for line in cell_text.splitlines():
                    if line == "-":
                        continue
                    if line.endswith(footnote_marks):
                        # The marker is remembered only for the form
                        # column but stripped from every column.
                        if data_cell_count == 0:
                            form_marks.append(line[-1])
                        line = line[:-1]
                    if target is not None:
                        target.append(line)
                data_cell_count += 1

        for form, hiragana, roman, mark in zip_longest(
            forms, hiraganas, romans, form_marks
        ):
            if not form or form in ("-", wxr.wtp.title):
                continue
            if header_text != "":
                form_tags = [header_text] + row_tags
            else:
                form_tags = row_tags
            new_form = Form(
                raw_tags=form_tags,
                source="inflection table",
                form=form,
                hiragana=hiragana or "",
                roman="" if roman in (None, "", "-") else roman,
            )
            if mark is not None:
                new_form.raw_tags.append(mark)
            translate_raw_tags(new_form)
            page_data[-1].forms.append(new_form)

    # Footnotes are only known once the whole table has been read, so the
    # markers are resolved in a second pass.
    for entry_form in page_data[-1].forms:
        if entry_form.source != "inflection table":
            continue
        for position, tag in enumerate(entry_form.raw_tags):
            footnote = footnotes.get(tag)
            if footnote is not None:
                entry_form.raw_tags[position] = footnote
        translate_raw_tags(entry_form)