Coverage for src / wiktextract / extractor / ja / pos.py: 83%
103 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-26 08:59 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-26 08:59 +0000
1from wikitextprocessor.parser import (
2 LEVEL_KIND_FLAGS,
3 LevelNode,
4 NodeKind,
5 TemplateNode,
6 WikiNode,
7)
9from ...page import clean_node
10from ...wxr_context import WiktextractContext
11from ..ruby import extract_ruby
12from .example import extract_example_list_item
13from .header import extract_header_nodes
14from .models import AltForm, Sense, WordEntry
15from .section_titles import POS_DATA
16from .tags import translate_raw_tags
def parse_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Create the word entry for one part-of-speech section and fill it.

    A deep copy of ``base_data`` is appended to ``page_data`` and given the
    ``pos`` (and optional ``tags``) registered for ``pos_title`` in
    ``POS_DATA``.  The child nodes located before the first gloss list are
    passed to ``extract_header_nodes``; every item of every gloss list is
    passed to ``process_gloss_list_item``; finally
    ``extract_conjugation_section`` is run on the section.

    If the section contains no gloss list and the conjugation step did not
    add any form, the freshly appended entry is removed from ``page_data``
    again.

    Raises:
        KeyError: if ``pos_title`` is not a key of ``POS_DATA``.
    """
    # Function-scope import kept from the original code; presumably it avoids
    # a circular import between the two modules -- TODO confirm.
    from .conjugation import extract_conjugation_section

    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_data = POS_DATA[pos_title]
    entry.pos = pos_data["pos"]
    entry.tags.extend(pos_data.get("tags", []))

    # An explicit flag is used instead of treating child index 0 as "not
    # found": a gloss list that happens to be the very first child would
    # otherwise be indistinguishable from "no gloss list at all", causing the
    # header to be extracted again for a later list and the entry to be
    # dropped even though senses were found.
    has_gloss_list = False
    for list_index, list_node in level_node.find_child(NodeKind.LIST, True):
        if not list_node.sarg.endswith("#"):  # linkage list
            continue
        if not has_gloss_list:
            has_gloss_list = True
            # The header is handled before any gloss item is processed, so
            # whatever it stores on the entry is visible to the gloss code.
            extract_header_nodes(
                wxr, entry, level_node.children[:list_index]
            )
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            process_gloss_list_item(wxr, entry, list_item)

    old_forms_len = len(entry.forms)
    extract_conjugation_section(wxr, entry, level_node)
    if not has_gloss_list and len(entry.forms) == old_forms_len:
        # Nothing useful was extracted from this section.
        page_data.pop()
def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item_node: WikiNode,
    parent_gloss: str = "",
) -> None:
    """Turn one gloss list item into a ``Sense`` and recurse into sub-lists.

    The non-list children of ``list_item_node`` are split into label
    templates (``context``, ``タグ``, ``lb``), whose comma-separated text is
    stored as raw tags, and gloss material, which is expanded, stripped of
    ruby and cleaned into the gloss text.  ``wp`` templates are dropped and
    ``wikipedia-s`` templates contribute their cleaned text without the
    trailing "⁽ʷᵖ⁾" marker.

    The sense is appended to ``word_entry.senses`` only when the gloss text
    is not empty.  Nested lists ending in ``*`` or ``:`` are treated as
    example lists of this sense; nested lists ending in ``#`` are processed
    recursively with this item's gloss text as ``parent_gloss``.

    ``parent_gloss``, when non-empty, is placed in front of the item's own
    gloss in ``Sense.glosses``.
    """
    item_children = list(
        list_item_node.invert_find_child(NodeKind.LIST, include_empty_str=True)
    )
    sense = Sense()
    find_form_of_data(wxr, word_entry, sense, list_item_node)
    if parent_gloss:
        sense.glosses.append(parent_gloss)

    kept_nodes = []
    for child in item_children:
        if not isinstance(child, TemplateNode):
            kept_nodes.append(child)
            continue

        name = child.template_name
        if name in ("context", "タグ"):
            # https://ja.wiktionary.org/wiki/テンプレート:context
            # https://ja.wiktionary.org/wiki/テンプレート:タグ
            label_text = clean_node(wxr, sense, child).strip("()")
            sense.raw_tags.extend(
                label
                for label in map(str.strip, label_text.split(","))
                if label
            )
        elif name == "wikipedia-s":
            link_text = clean_node(wxr, None, child)
            kept_nodes.append(link_text.removesuffix("⁽ʷᵖ⁾").strip())
        elif name == "wp":
            # Contributes nothing to the gloss text.
            continue
        elif name == "lb":
            extract_lb_template(wxr, sense, child)
        else:
            kept_nodes.append(child)

    gloss_wikitext = wxr.wtp.node_to_wikitext(kept_nodes)
    expanded = wxr.wtp.parse(gloss_wikitext, expand_all=True)
    ruby, plain_nodes = extract_ruby(wxr, expanded.children)
    gloss_text = clean_node(wxr, sense, plain_nodes)
    sense.ruby = ruby
    if gloss_text:
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item_node.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if marker.endswith(("*", ":")):
            # Examples are attached to this item's sense object.
            for example_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, example_item
                )
        elif marker.endswith("#"):
            for child_item in sub_list.find_child(NodeKind.LIST_ITEM):
                process_gloss_list_item(
                    wxr, word_entry, child_item, gloss_text
                )
def find_form_of_data(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item_node: WikiNode,
) -> None:
    """Fill ``sense.form_of`` from the links of a gloss list item.

    Each direct child template whose name ends with " of" is expanded, and
    the first link in the expansion that cleans to non-empty text is stored
    as an ``AltForm``.

    If that yields nothing and ``word_entry`` carries the "form-of" tag, the
    first direct child link of the list item with non-empty text is used
    instead, and the "form-of" tag is also added to the sense.
    """
    for template in list_item_node.find_child(NodeKind.TEMPLATE):
        if not template.template_name.endswith(" of"):
            continue
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(template), expand_all=True
        )
        for link in expanded.find_child_recursively(NodeKind.LINK):
            target = clean_node(wxr, None, link)
            if target:
                sense.form_of.append(AltForm(word=target))
                break

    # The fallback only applies to entries already tagged as form-of for
    # which no template produced a target.
    if "form-of" not in word_entry.tags or sense.form_of:
        return

    for link in list_item_node.find_child(NodeKind.LINK):
        target = clean_node(wxr, None, link)
        if target:
            sense.form_of.append(AltForm(word=target))
            sense.tags.append("form-of")
            break
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Append the notes of a section to ``word_entry.notes``.

    When the section contains list items (at any depth), each item's
    children are cleaned into one note.  Otherwise the section's children
    other than nested level nodes are cleaned into a single note.  Empty
    results are skipped.
    """
    node_groups = [
        list_item.children
        for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM)
    ]
    if not node_groups:
        # No list at all: fall back to the section's own content.
        node_groups = [
            list(
                level_node.invert_find_child(
                    LEVEL_KIND_FLAGS, include_empty_str=True
                )
            )
        ]

    for nodes in node_groups:
        note = clean_node(wxr, word_entry, nodes)
        if note:
            word_entry.notes.append(note)
def extract_lb_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Store the labels of an ``lb`` template as raw tags of ``sense``.

    The cleaned template text has surrounding parentheses and spaces
    removed and is split on commas; every non-empty, whitespace-trimmed
    piece is appended to ``sense.raw_tags``.
    """
    label_text = clean_node(wxr, sense, t_node).strip("() ")
    sense.raw_tags.extend(
        label for label in map(str.strip, label_text.split(",")) if label
    )