Coverage for src/wiktextract/extractor/cs/pos.py: 94%
75 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1from wikitextprocessor.parser import (
2 LEVEL_KIND_FLAGS,
3 LevelNode,
4 NodeKind,
5 TemplateNode,
6 WikiNode,
7)
9from ...page import clean_node
10from ...wxr_context import WiktextractContext
11from .example import extract_example_list_item
12from .models import AltForm, Sense, WordEntry
13from .section_titles import POS_DATA
14from .tags import translate_raw_tags
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    """Open a new word entry for a part-of-speech section.

    A deep copy of ``base_data`` is appended to ``page_data`` and receives
    the section title together with the ``pos`` value (and optional
    ``tags``) that ``POS_DATA`` holds for ``pos_title``; ``base_data.pos``
    is updated to the same value.  Each whitespace-separated word found in
    an italic node of a ``*`` list item directly under the section is
    stored as a raw tag, except the word "rod".  ``translate_raw_tags`` is
    finally run on the new entry.

    Raises ``KeyError`` when ``pos_title`` is not a key of ``POS_DATA``;
    the copy has already been appended to ``page_data`` at that point.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    base_data.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    for bullet_list in level_node.find_child(NodeKind.LIST):
        # Only "*" lists are scanned for italic words.
        if bullet_list.sarg == "*":
            for item in bullet_list.find_child(NodeKind.LIST_ITEM):
                for italic in item.find_child(NodeKind.ITALIC):
                    words = clean_node(wxr, None, italic).split()
                    entry.raw_tags.extend(
                        word for word in words if word not in ("", "rod")
                    )

    translate_raw_tags(entry)
def extract_sense_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Extract senses from the ``#`` lists directly under ``level_node``.

    Every list item of such a list is handed to
    ``extract_gloss_list_item``; lists with any other marker are ignored.
    """
    numbered_lists = (
        child
        for child in level_node.find_child(NodeKind.LIST)
        if child.sarg == "#"
    )
    for numbered_list in numbered_lists:
        for item in numbered_list.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, word_entry, item)
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
):
    """Build a sense from one gloss list item and add it to ``word_entry``.

    The sense starts as a deep copy of ``parent_sense`` when one is given,
    otherwise as an empty ``Sense``.  The children of ``list_item`` are
    handled as follows:

    * a "Příznaky" template is passed to ``extract_příznaky_template``;
    * an italic node whose cleaned text is wrapped in parentheses is split
      on commas into raw tags;
    * any other italic node becomes part of the gloss; if it contains
      exactly one link and that link is its last (but not first) child,
      the link text is recorded in ``form_of`` with the "form-of" tag and
      the remaining children are not scanned;
    * nested lists are left out of the gloss text;
    * everything else becomes part of the gloss.

    The sense is appended to ``word_entry.senses`` only when the cleaned
    gloss text is non-empty.  Nested lists whose marker starts with ``#``
    are then processed: a marker ending in ``#`` recurses with this sense
    as parent, a marker ending in ``:`` or ``*`` is sent to
    ``extract_example_list_item``.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_parts = []
    for child in list_item.children:
        if (
            isinstance(child, TemplateNode)
            and child.template_name == "Příznaky"
        ):
            extract_příznaky_template(wxr, sense, child)
            continue
        if not isinstance(child, WikiNode):
            gloss_parts.append(child)
            continue
        if child.kind == NodeKind.LIST:
            # Nested lists are handled after the gloss has been built.
            continue
        if child.kind != NodeKind.ITALIC:
            gloss_parts.append(child)
            continue

        italic_text = clean_node(wxr, None, child)
        if italic_text.startswith("(") and italic_text.endswith(")"):
            # "(tag one, tag two)" -> individual raw tags
            pieces = (
                piece.strip()
                for piece in italic_text.strip("() ").split(",")
            )
            for piece in pieces:
                if piece != "":
                    sense.raw_tags.append(piece)
        elif child.contain_node(NodeKind.LINK):
            gloss_parts.append(child)
            links = list(child.find_child(NodeKind.LINK, with_index=True))
            if len(links) == 1:
                link_index = links[0][0]
                last_index = len(child.children) - 1
                if link_index != 0 and link_index == last_index:
                    word = clean_node(wxr, None, links[0][1])
                    if word != "":
                        sense.form_of.append(AltForm(word=word))
                        sense.tags.append("form-of")
                    # NOTE(review): the break is taken even when the link
                    # text is empty; its nesting was reconstructed from a
                    # flattened listing - confirm against the repository.
                    break
        else:
            gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            # Deeper numbered list: sub-senses inheriting from this sense.
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
        elif marker.endswith((":", "*")):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, sense, sub_item)
def extract_příznaky_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Add the labels of a "Příznaky" template to ``sense.raw_tags``.

    The cleaned template text has surrounding parentheses and spaces
    stripped and is split on commas; every non-empty, trimmed piece is
    appended as a raw tag.
    """
    # https://cs.wiktionary.org/wiki/Šablona:Příznaky
    cleaned = clean_node(wxr, sense, t_node)
    labels = (part.strip() for part in cleaned.strip("() ").split(","))
    sense.raw_tags.extend(label for label in labels if label != "")
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Store the cleaned text of a note section in ``word_entry.note``.

    The nodes passed to ``clean_node`` are those returned by
    ``level_node.invert_find_child(LEVEL_KIND_FLAGS, True)``.  Any previous
    value of ``word_entry.note`` is replaced.
    """
    note_nodes = list(level_node.invert_find_child(LEVEL_KIND_FLAGS, True))
    word_entry.note = clean_node(wxr, word_entry, note_nodes)