Coverage for src / wiktextract / extractor / cs / pos.py: 92%
84 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1from wikitextprocessor.parser import (
2 LEVEL_KIND_FLAGS,
3 LevelNode,
4 NodeKind,
5 TemplateNode,
6 WikiNode,
7)
9from ...page import clean_node
10from ...wxr_context import WiktextractContext
11from .example import extract_example_list_item
12from .models import AltForm, Sense, WordEntry
13from .section_titles import POS_DATA
14from .tags import translate_raw_tags
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    """Start a new word entry for a part-of-speech section.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled in
    with ``pos_title`` plus the ``pos`` value and optional ``tags`` found
    under ``POS_DATA[pos_title]``.  ``base_data.pos`` is updated as well.
    ``pos_title`` must be a key of ``POS_DATA``.

    Only ``*`` lists that are direct children of ``level_node`` are read:

    * If the section contains child sections, every whitespace-separated
      word of each italic node in a list item becomes a raw tag of the new
      entry, except the word ``"rod"``.
    * Otherwise, the first link of a list item that cleans to non-empty
      text turns the item into a ``form-of`` sense whose gloss is the
      cleaned text of the whole list item, and the entry itself is tagged
      ``form-of`` (once).

    Finally the collected raw tags are passed to ``translate_raw_tags``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_data = POS_DATA[pos_title]
    entry.pos = pos_data["pos"]
    base_data.pos = pos_data["pos"]
    entry.tags.extend(pos_data.get("tags", []))
    has_child_section = level_node.contain_node(LEVEL_KIND_FLAGS)

    for list_node in level_node.find_child(NodeKind.LIST):
        if list_node.sarg != "*":
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            if has_child_section:
                for italic_node in list_item.find_child(NodeKind.ITALIC):
                    italic_str = clean_node(wxr, None, italic_node)
                    # str.split() without arguments never yields empty
                    # strings, so only the "rod" word has to be filtered.
                    for raw_tag in italic_str.split():
                        if raw_tag != "rod":
                            entry.raw_tags.append(raw_tag)
            else:
                for link_node in list_item.find_child(NodeKind.LINK):
                    word = clean_node(wxr, None, link_node)
                    if word != "":
                        entry.senses.append(
                            Sense(
                                glosses=[
                                    clean_node(wxr, None, list_item.children)
                                ],
                                tags=["form-of"],
                                form_of=[AltForm(word=word)],
                            )
                        )
                        # Test the tag list, not the model object: a
                        # membership test on the model itself never matches
                        # a plain string, which used to add a duplicate
                        # "form-of" tag for every matching list item.
                        if "form-of" not in entry.tags:
                            entry.tags.append("form-of")
                        # Only the first usable link of a list item counts.
                        break

    translate_raw_tags(entry)
def extract_sense_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Extract a gloss from every item of the section's ``#`` lists.

    Lists that are direct children of ``level_node`` and whose marker is not
    exactly ``#`` are ignored; each item of the remaining lists is handed to
    ``extract_gloss_list_item`` together with ``word_entry``.
    """
    numbered_lists = (
        candidate
        for candidate in level_node.find_child(NodeKind.LIST)
        if candidate.sarg == "#"
    )
    for numbered_list in numbered_lists:
        for gloss_item in numbered_list.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, word_entry, gloss_item)
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
):
    """Build a sense from one gloss list item and recurse into child lists.

    The sense starts as a deep copy of ``parent_sense`` when one is given,
    otherwise as an empty ``Sense``.  Children of ``list_item`` are sorted
    into raw tags, form-of data, or gloss text.  If the cleaned gloss text
    is non-empty the sense is appended to ``word_entry.senses``.  Child
    lists are then processed as nested glosses or as examples, depending
    on their list marker.
    """
    sense = (
        parent_sense.model_copy(deep=True)
        if parent_sense is not None
        else Sense()
    )
    gloss_nodes = []
    for node in list_item.children:
        if isinstance(node, TemplateNode) and node.template_name == "Příznaky":
            # Label template: its expanded text is split into raw tags.
            extract_příznaky_template(wxr, sense, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            raw_tags = clean_node(wxr, None, node)
            if raw_tags.startswith("(") and raw_tags.endswith(")"):
                # Italic text fully wrapped in parentheses is read as a
                # comma-separated list of raw tags, not as gloss text.
                for raw_tag in raw_tags.strip("() ").split(","):
                    raw_tag = raw_tag.strip()
                    if raw_tag != "":
                        sense.raw_tags.append(raw_tag)
            elif node.contain_node(NodeKind.LINK):
                gloss_nodes.append(node)
                link_nodes = list(
                    node.find_child(NodeKind.LINK, with_index=True)
                )
                # Form-of pattern: the italic node has exactly one direct
                # link child, and it is the last child but not the first.
                if (
                    len(link_nodes) == 1
                    and link_nodes[0][0] != 0
                    and link_nodes[0][0] == len(node.children) - 1
                ):
                    word = clean_node(wxr, None, link_nodes[0][1])
                    if word != "":
                        sense.form_of.append(AltForm(word=word))
                        sense.tags.append("form-of")
                    # Stops scanning the remaining children of the list item.
                    # NOTE(review): the nesting level of this `break` was
                    # reconstructed from a listing without indentation; it is
                    # known to be outside `if word != ""` — confirm that it
                    # belongs inside the enclosing `if`.
                    break
            else:
                gloss_nodes.append(node)
        elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
            # Everything except nested lists contributes to the gloss text;
            # nested lists are handled after the gloss is stored.
            gloss_nodes.append(node)

    gloss = clean_node(wxr, sense, gloss_nodes)
    if gloss != "":
        sense.glosses.append(gloss)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        if child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):
            # Marker made of "#..#": nested glosses, with the current sense
            # passed on as the parent to copy from.
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, child_list_item, sense)
        elif child_list.sarg.startswith("#") and child_list.sarg.endswith(
            (":", "*")
        ):
            # Marker "#..:" or "#..*": items are examples of this sense.
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, sense, child_list_item)
def extract_příznaky_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Add the labels of a ``Příznaky`` template to ``sense.raw_tags``.

    The cleaned template text has surrounding parentheses and spaces
    removed and is then split on commas; every non-empty, stripped piece is
    appended as a raw tag.

    Template page: https://cs.wiktionary.org/wiki/Šablona:Příznaky
    """
    expanded = clean_node(wxr, sense, t_node)
    label_text = expanded.strip("() ")
    for piece in label_text.split(","):
        label = piece.strip()
        if label == "":
            continue
        sense.raw_tags.append(label)
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Store the cleaned text of a note section in ``word_entry.note``.

    The nodes to clean are those returned by
    ``level_node.invert_find_child(LEVEL_KIND_FLAGS, True)``.
    """
    note_nodes = list(level_node.invert_find_child(LEVEL_KIND_FLAGS, True))
    note_text = clean_node(wxr, word_entry, note_nodes)
    word_entry.note = note_text