Coverage for src / wiktextract / extractor / zh / inflection.py: 92%
80 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import re
2from itertools import zip_longest
4from wikitextprocessor import NodeKind, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
# Name prefixes of the Japanese inflection-table templates handled by this
# module. extract_inflections() lowercases a template's name and passes this
# tuple straight to str.startswith(), so an entry matches every template whose
# name begins with it (e.g. "ja-go-" covers all "ja-go-*" templates).
# Source list (zh.wiktionary category of Japanese inflection-table templates):
# https://zh.wiktionary.org/wiki/Category:日語變格表模板
JAPANESE_INFLECTION_TEMPLATE_PREFIXES = (
    "ja-i",
    "ja-adj-infl",
    "ja-conj-bungo",
    "ja-go-",
    "ja-honorific",
    "ja-ichi",
    "ja-kuru",
    "ja-suru",
    "ja-verbconj",
    "ja-zuru",
    "ja-na",
)
def extract_inflections(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
) -> None:
    """Parse the Japanese inflection templates directly under *level_node*.

    Each direct template child whose lowercased name starts with one of
    JAPANESE_INFLECTION_TEMPLATE_PREFIXES is converted back to wikitext,
    parsed again with all templates expanded, and every table found anywhere
    in the expansion is passed to extract_ja_inf_table().

    Args:
        wxr: Extraction context; its ``wtp`` processor does the expansion.
        page_data: Word entries collected so far; forwarded unchanged.
        level_node: Section node whose template children are inspected.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        template_name = template.template_name.lower()
        if not template_name.startswith(JAPANESE_INFLECTION_TEMPLATE_PREFIXES):
            # not one of the inflection templates this module handles
            continue

        wikitext = wxr.wtp.node_to_wikitext(template)
        expanded_root = wxr.wtp.parse(wikitext, expand_all=True)
        for table in expanded_root.find_child_recursively(NodeKind.TABLE):
            extract_ja_inf_table(wxr, page_data, table)
# Footnote markers attached to forms in the inflection tables: the superscript
# digits "¹"/"²", or "^([N])" (presumably how clean_node renders a numbered
# <sup> reference -- TODO confirm against clean_node).
_FOOTNOTE_MARK_PATTERN = r"(¹|²|\^\(\[\d+\]\))"
_LEADING_FOOTNOTE_RE = re.compile(_FOOTNOTE_MARK_PATTERN)
_TRAILING_FOOTNOTE_RE = re.compile(_FOOTNOTE_MARK_PATTERN + r"$")


def _collect_footnote_legend(
    wxr: WiktextractContext,
    row_node: WikiNode,
    footnotes: dict[str, str],
) -> bool:
    """Record the footnote definitions found in a single-cell table row.

    Every text line inside a ``<small>`` element of *row_node* that starts
    with a footnote marker is stored in *footnotes* as
    ``marker -> stripped remainder of the line``.

    Returns:
        True if the row contains at least one ``<small>`` element (the row is
        then treated as a footnote legend), False otherwise.
    """
    found_small = False
    for small_node in row_node.find_html_recursively("small"):
        found_small = True
        for line in clean_node(wxr, None, small_node).splitlines():
            m = _LEADING_FOOTNOTE_RE.match(line)
            if m is not None:
                footnotes[line[: m.end()]] = line[m.end() :].strip()
    return found_small


def _parse_form_row(
    wxr: WiktextractContext,
    row_node: WikiNode,
) -> tuple[list, list, list, list, list]:
    """Split one multi-cell table row into tags and per-column values.

    Returns:
        ``(raw_tags, forms, marks, hiragana, romanizations)`` where

        * ``raw_tags`` holds the non-empty lines of header cells (with
          surrounding parentheses/spaces stripped) and the text of bold
          children of data cells;
        * ``forms`` holds the "Jpan" lines of the first data cell that has a
          ``<span>`` child, ``hiragana`` those of the second such cell;
        * ``marks`` is parallel to ``forms``: the footnote marker that was
          stripped from that form, or None;
        * ``romanizations`` holds the lines of any counted cell whose first
          ``<span>`` has class "Latn".
    """
    raw_tags = []
    forms = []
    marks = []
    hiragana = []
    romanizations = []
    # Counts only data cells that contain a <span> child; cells without one
    # do not advance the column position.
    cell_index = 0
    for cell in row_node.find_child(
        NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    ):
        if cell.kind == NodeKind.TABLE_HEADER_CELL:
            for line in clean_node(wxr, None, cell).splitlines():
                line = line.strip("() ")
                if line:
                    raw_tags.append(line)
        elif cell.kind == NodeKind.TABLE_CELL:
            if cell_index >= 3:
                break
            # Bold text inside a data cell acts as a row header.
            for bold_node in cell.find_child(NodeKind.BOLD):
                raw_tags.append(clean_node(wxr, None, bold_node))
            for span_tag in cell.find_html("span"):
                # Only the first <span> decides the script class, but the
                # text of the whole cell is read so that footnote markers
                # outside the span are still seen.
                span_class = span_tag.attrs.get("class", "")
                for line in clean_node(wxr, None, cell).splitlines():
                    if line == "-":
                        continue
                    mark = None
                    m = _TRAILING_FOOTNOTE_RE.search(line)
                    if m is not None:
                        mark = m.group(1)
                        line = line[: m.start(1)]
                    if span_class == "Latn":
                        romanizations.append(line)
                    elif span_class == "Jpan":
                        if cell_index == 0:
                            forms.append(line)
                            # Keep one entry per form (None when the form has
                            # no footnote) so markers stay attached to the
                            # form they were written on.
                            marks.append(mark)
                        elif cell_index == 1:
                            hiragana.append(line)
                cell_index += 1
                break
    return raw_tags, forms, marks, hiragana, romanizations


def extract_ja_inf_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    table_node: WikiNode,
) -> None:
    """Extract the forms of one expanded Japanese inflection table.

    Rows with exactly one non-empty child are either a footnote legend (when
    they contain ``<small>``) or a heading whose text is prepended to the
    raw tags of the rows that follow. Every other row yields ``Form`` objects
    with ``source="inflection table"`` that are appended to
    ``page_data[-1].forms``. After all rows are read, footnote markers left in
    the raw tags of such forms are replaced by their legend text and the tags
    are translated again.

    Args:
        wxr: Extraction context; ``wxr.wtp.title`` is used to skip forms equal
            to the page title.
        page_data: Word entries collected so far; must be non-empty because
            the forms are added to its last element.
        table_node: The table node to read.
    """
    table_header = ""
    footnotes: dict[str, str] = {}
    for row_node in table_node.find_child(NodeKind.TABLE_ROW):
        if len(list(row_node.filter_empty_str_child())) == 1:
            if not _collect_footnote_legend(wxr, row_node, footnotes):
                table_header = clean_node(wxr, None, row_node.children)
            continue

        raw_tags, forms, marks, hiragana_list, roman_list = _parse_form_row(
            wxr, row_node
        )
        if table_header != "":
            raw_tags = [table_header, *raw_tags]

        for form, hiragana, roman, mark in zip_longest(
            forms, hiragana_list, roman_list, marks
        ):
            if form in [None, "", "-", wxr.wtp.title]:
                continue
            form_data = Form(
                # Fresh list per form: the footnote marker appended below
                # must not leak into the other forms of this row.
                raw_tags=list(raw_tags),
                source="inflection table",
                form=form,
                hiragana=hiragana or "",
                roman=roman if roman not in [None, "", "-"] else "",
            )
            if mark is not None:
                form_data.raw_tags.append(mark)
            translate_raw_tags(form_data)
            page_data[-1].forms.append(form_data)

    # The legend row comes after the rows that use the markers, so markers
    # can only be resolved once the whole table has been read.
    for form_data in page_data[-1].forms:
        if form_data.source == "inflection table":
            for index, raw_tag in enumerate(form_data.raw_tags):
                if raw_tag in footnotes:
                    form_data.raw_tags[index] = footnotes[raw_tag]
            translate_raw_tags(form_data)