Coverage for src/wiktextract/extractor/ja/pos.py: 82%
99 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1from wikitextprocessor.parser import (
2 LEVEL_KIND_FLAGS,
3 LevelNode,
4 NodeKind,
5 TemplateNode,
6 WikiNode,
7)
9from ...page import clean_node
10from ...wxr_context import WiktextractContext
11from ..ruby import extract_ruby
12from .example import extract_example_list_item
13from .header import extract_header_nodes
14from .models import AltForm, Sense, WordEntry
15from .section_titles import POS_DATA
16from .tags import translate_raw_tags
def parse_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Parse one part-of-speech section and append a new entry to page_data.

    A deep copy of ``base_data`` is appended, filled with the POS name/tags
    from ``POS_DATA``, glosses from the section's "#" lists, header forms,
    and conjugation data.  If the section yields neither glosses nor new
    conjugation forms, the entry is removed again.
    """
    # Local import to avoid a circular import with the conjugation module.
    from .conjugation import extract_conjugation_section

    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    pos_data = POS_DATA[pos_title]
    page_data[-1].pos = pos_data["pos"]
    page_data[-1].tags.extend(pos_data.get("tags", []))
    gloss_list_start = 0
    for list_index, list_node in level_node.find_child(NodeKind.LIST, True):
        if not list_node.sarg.endswith("#"):  # linkage list
            continue
        # Remember where the first gloss list starts; everything before it
        # belongs to the headword line.
        if gloss_list_start == 0:
            gloss_list_start = list_index
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            process_gloss_list_item(wxr, page_data[-1], list_item)
    extract_header_nodes(
        wxr, page_data[-1], level_node.children[:gloss_list_start]
    )
    old_forms_len = len(page_data[-1].forms)
    extract_conjugation_section(wxr, page_data[-1], level_node)
    # Drop the entry when no gloss list was found and the conjugation
    # section contributed no new forms either.
    if gloss_list_start == 0 and len(page_data[-1].forms) == old_forms_len:
        page_data.pop()
def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item_node: WikiNode,
    parent_gloss: str = "",
) -> None:
    """Extract one gloss list item into a new ``Sense`` on ``word_entry``.

    Tag templates ("context", "タグ", "lb") contribute raw tags instead of
    gloss text; nested "#" lists recurse with this item's gloss text as the
    parent gloss, and nested "*"/":" lists are parsed as examples.
    """
    gloss_nodes = list(
        list_item_node.invert_find_child(NodeKind.LIST, include_empty_str=True)
    )
    sense_data = Sense()
    find_form_of_data(wxr, word_entry, sense_data, list_item_node)
    if len(parent_gloss) > 0:
        sense_data.glosses.append(parent_gloss)
    gloss_only_nodes = []
    for gloss_node in gloss_nodes:
        if isinstance(gloss_node, TemplateNode):
            if gloss_node.template_name in ("context", "タグ"):
                # https://ja.wiktionary.org/wiki/テンプレート:context
                # https://ja.wiktionary.org/wiki/テンプレート:タグ
                for raw_tag in (
                    clean_node(wxr, sense_data, gloss_node)
                    .strip("()")
                    .split(",")
                ):
                    raw_tag = raw_tag.strip()
                    if len(raw_tag) > 0:
                        sense_data.raw_tags.append(raw_tag)
            elif gloss_node.template_name == "wikipedia-s":
                # Keep the link text but drop the superscript "(wp)" marker
                # the template appends.
                expanded_text = clean_node(wxr, None, gloss_node)
                gloss_only_nodes.append(
                    expanded_text.removesuffix("⁽ʷᵖ⁾").strip()
                )
            elif gloss_node.template_name == "wp":
                continue  # bare Wikipedia box; contributes no gloss text
            elif gloss_node.template_name == "lb":
                extract_lb_template(wxr, sense_data, gloss_node)
            else:
                gloss_only_nodes.append(gloss_node)
        else:
            gloss_only_nodes.append(gloss_node)
    # Re-parse with expansion so remaining templates become plain wikitext,
    # then split ruby annotations from the gloss text.
    expanded_gloss = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(gloss_only_nodes), expand_all=True
    )
    ruby, no_ruby = extract_ruby(wxr, expanded_gloss.children)
    gloss_text = clean_node(wxr, sense_data, no_ruby)
    sense_data.ruby = ruby
    if len(gloss_text) > 0:
        sense_data.glosses.append(gloss_text)
        translate_raw_tags(sense_data)
        word_entry.senses.append(sense_data)

    for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
        if nest_gloss_list.sarg.endswith(("*", ":")):
            # Example sentences attached to this sense.
            for example_list_item in nest_gloss_list.find_child(
                NodeKind.LIST_ITEM
            ):
                extract_example_list_item(
                    wxr, word_entry, sense_data, example_list_item
                )
        elif nest_gloss_list.sarg.endswith("#"):
            # Sub-senses: recurse with this gloss as the parent gloss.
            for nest_list_item in nest_gloss_list.find_child(
                NodeKind.LIST_ITEM
            ):
                process_gloss_list_item(
                    wxr, word_entry, nest_list_item, gloss_text
                )
def find_form_of_data(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item_node: WikiNode,
) -> None:
    """Find "form of" links in a gloss list item and record them on ``sense``.

    "... of" templates are expanded and the first non-empty link they
    produce is taken as the base form.  As a fallback, when the entry is
    already tagged "form-of" and nothing was found, the first plain link in
    the list item is used.
    """
    for node in list_item_node.find_child(NodeKind.TEMPLATE):
        if node.template_name.endswith(" of"):
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node), expand_all=True
            )
            for link_node in expanded_node.find_child_recursively(
                NodeKind.LINK
            ):
                form_of = clean_node(wxr, None, link_node)
                if form_of != "":
                    sense.form_of.append(AltForm(word=form_of))
                    break  # only the first link per template
    if "form-of" in word_entry.tags and len(sense.form_of) == 0:
        for link_node in list_item_node.find_child(NodeKind.LINK):
            form_of = clean_node(wxr, None, link_node)
            if form_of != "":
                sense.form_of.append(AltForm(word=form_of))
                break
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect usage notes from a note section into ``word_entry.notes``.

    Each list item becomes its own note; when the section has no list at
    all, everything except child subsections is cleaned into a single note.
    """
    found_list_item = False
    for item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        found_list_item = True
        note_text = clean_node(wxr, word_entry, item_node.children)
        if note_text != "":
            word_entry.notes.append(note_text)
    if found_list_item:
        return
    # No list items: fall back to the section body minus subsections.
    body_nodes = list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    note_text = clean_node(wxr, word_entry, body_nodes)
    if note_text != "":
        word_entry.notes.append(note_text)
def extract_lb_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Parse an "lb" (label) template into raw tags on ``sense``.

    The expanded template renders as a parenthesized, comma-separated
    label list; each non-empty part is appended to ``sense.raw_tags``.
    """
    text = clean_node(wxr, sense, t_node).strip("() ")
    for raw_tag in text.split(","):
        raw_tag = raw_tag.strip()
        if raw_tag != "":
            sense.raw_tags.append(raw_tag)