Coverage for src/wiktextract/extractor/zh/inflection.py: 92%
75 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
from itertools import zip_longest

from wikitextprocessor import NodeKind, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, WordEntry
# Name prefixes of the Japanese inflection-table templates, see
# https://zh.wiktionary.org/wiki/Category:日語變格表模板
# Kept as a tuple so it can be passed directly to `str.startswith()`.
JAPANESE_INFLECTION_TEMPLATE_PREFIXES = (
    "ja-i",
    "ja-adj-infl",
    "ja-conj-bungo",
    "ja-go",
    "ja-honorific",
    "ja-ichi",
    "ja-kuru",
    "ja-suru",
    "ja-verbconj",
    "ja-zuru",
)
def extract_inflections(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
) -> None:
    """Extract inflection forms from a section that uses a Japanese
    inflection template.

    The section is only processed when at least one of its direct template
    children has a name starting with one of
    `JAPANESE_INFLECTION_TEMPLATE_PREFIXES` (compared in lower case).

    Args:
        wxr: Extraction context; its `wtp` object is used to turn the
            section back into wikitext and to re-parse it with all
            templates expanded.
        page_data: Word entries of the current page; the tables are handed
            to `extract_ja_inf_table()` together with this list.
        level_node: The section node to scan for inflection templates.
    """
    has_inflection_template = any(
        child.template_name.lower().startswith(
            JAPANESE_INFLECTION_TEMPLATE_PREFIXES
        )
        for child in level_node.find_child(NodeKind.TEMPLATE)
    )
    if not has_inflection_template:
        return

    # The whole section (not a single template) is expanded, so one pass
    # already visits the tables of every template in it.  Expanding it once
    # per matching template would extract the same tables repeatedly and
    # add duplicate forms.
    expanded_section = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(level_node), expand_all=True
    )
    for table_node in expanded_section.find_child_recursively(
        NodeKind.TABLE
    ):
        extract_ja_inf_table(wxr, page_data, table_node)
def extract_ja_inf_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    table_node: WikiNode,
) -> None:
    """Collect the forms listed in one expanded inflection table.

    Rows with exactly one non-empty child are either footnote rows (they
    contain `<small>` tags whose text starts with "¹" or "²") or, when no
    `<small>` tag is present, the table header.  Every other row is read
    cell by cell and turned into `Form` objects that are appended to
    `page_data[-1].forms`.

    After all rows are processed, footnote markers stored as raw tags are
    replaced by the footnote text.

    Args:
        wxr: Extraction context passed through to `clean_node()`.
        page_data: Word entries of the current page; forms are added to
            the last entry.
        table_node: The table node to read.
    """
    # `clean_node()` returns text, so the header is kept as a string from
    # the start; an empty header is left out of the raw tags instead of
    # being added as a placeholder value.
    table_header = ""
    # Footnote marker ("¹"/"²") -> footnote text.
    small_tags_dict: dict[str, str] = {}

    for row_node in table_node.find_child(NodeKind.TABLE_ROW):
        if len(list(row_node.filter_empty_str_child())) == 1:
            has_small_tag = False
            # table end tags
            for small_tag in row_node.find_html_recursively("small"):
                has_small_tag = True
                tag_text = clean_node(wxr, None, small_tag)
                if tag_text.startswith(("¹", "²")):
                    small_tags_dict[tag_text[0]] = tag_text[1:].strip()
            if not has_small_tag:
                table_header = clean_node(wxr, None, row_node.children)
            continue

        form_list: list[str] = []
        hiragana_list: list[str] = []
        roman_list: list[str] = []
        raw_tags: list[str] = []
        small_tags: list[str] = []
        # Counts the data cells that contained a <span>; cell 0 holds the
        # forms and cell 1 the hiragana.
        cell_node_index = 0
        for row_child in row_node.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            if row_child.kind == NodeKind.TABLE_HEADER_CELL:
                for line in clean_node(wxr, None, row_child).splitlines():
                    line = line.strip("() ")
                    if line:
                        raw_tags.append(line)
            elif row_child.kind == NodeKind.TABLE_CELL:
                if cell_node_index >= 3:
                    break
                # Bold text in a data cell is a row header.
                for bold_node in row_child.find_child(NodeKind.BOLD):
                    raw_tags.append(clean_node(wxr, None, bold_node))
                for span_tag in row_child.find_html("span"):
                    # Only the first <span> is inspected: its class decides
                    # how every line of the whole cell text is classified.
                    span_text = clean_node(wxr, None, row_child)
                    span_class = span_tag.attrs.get("class", "")
                    for line in span_text.splitlines():
                        if line == "-":
                            continue
                        if line.endswith(("¹", "²")):
                            # Strip the footnote marker; it is remembered
                            # only for the form cell.
                            if cell_node_index == 0:
                                small_tags.append(line[-1])
                            line = line[:-1]
                        if span_class == "Latn":
                            roman_list.append(line)
                        elif span_class == "Jpan":
                            if cell_node_index == 0:
                                form_list.append(line)
                            elif cell_node_index == 1:
                                hiragana_list.append(line)
                    cell_node_index += 1
                    break

        header_tags = [table_header] if table_header else []
        for form, hiragana, roman, small_tag in zip_longest(
            form_list, hiragana_list, roman_list, small_tags
        ):
            if form is None:
                continue
            form_data = Form(
                raw_tags=header_tags + raw_tags,
                source="inflection table",
                form=form,
                hiragana=hiragana or "",
                roman=roman or "",
            )
            if small_tag is not None:
                form_data.raw_tags.append(small_tag)
            page_data[-1].forms.append(form_data)

    # Footnote rows come at the end of the table, so the markers can only be
    # replaced by their text once every row has been read.
    for form_data in page_data[-1].forms:
        if form_data.source == "inflection table":
            for index, raw_tag in enumerate(form_data.raw_tags):
                if raw_tag in small_tags_dict:
                    form_data.raw_tags[index] = small_tags_dict[raw_tag]