Coverage for src / wiktextract / extractor / cs / declension.py: 95%
87 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1from dataclasses import dataclass
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import capture_text_in_parentheses
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
# Czech declension section titles mapped to the tags that are passed to the
# template extractors for every form found in that section. Titles missing
# from this mapping contribute no section tags.
DECLENSION_SECTION_TAGS: dict[str, list[str]] = {
    "skloňování mužské": ["masculine"],
    "skloňování ženské": ["feminine"],
}
def extract_declension_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    section_title: str,
):
    """Extract inflected forms from the templates of a declension section.

    Every template that is a direct child of ``level_node`` is dispatched by
    name to the matching table extractor; templates with other names are
    ignored. ``section_title`` selects extra tags from
    ``DECLENSION_SECTION_TAGS`` (none when the title is not listed) that are
    handed to the extractor.
    """
    table_template_prefixes = (
        "Substantivum ",
        "Adjektivum ",
        "Stupňování ",
        "Sloveso ",
    )
    section_tags = DECLENSION_SECTION_TAGS.get(section_title, [])
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name
        # "Sloveso (ja)" must be tested first: it also matches the generic
        # "Sloveso " prefix but has its own extractor.
        if name == "Sloveso (ja)":
            handler = extract_sloveso_ja_template
        elif name.startswith(table_template_prefixes):
            handler = extract_substantivum_template
        else:
            continue
        handler(wxr, word_entry, template, section_tags)
@dataclass
class TableHeader:
    """A table header cell together with the grid area it covers."""

    # Cleaned text of the header cell.
    text: str
    # Number of columns the cell spans.
    colspan: int
    # Number of rows the cell spans.
    rowspan: int
    # Column position at which the cell starts.
    col_index: int
    # Row position at which the cell starts.
    row_index: int
def _table_span(cell, attr_name: str) -> int:
    """Read a cell's ``colspan``/``rowspan`` attribute, defaulting to 1.

    The value originates from wiki markup, so a non-numeric value is treated
    like a missing attribute instead of raising ``ValueError``.
    """
    try:
        return int(cell.attrs.get(attr_name, "1"))
    except ValueError:
        return 1


def extract_substantivum_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    section_tags: list[str],
):
    """Collect forms from the tables produced by an inflection template.

    The template is expanded and every table in the result is walked row by
    row. Header cells are remembered with their position and span; each data
    cell yields one form (two when the cell text contains exactly one "/"),
    tagged with the table caption and the text of every header whose span
    overlaps the cell. Empty cells, "—" and the page title are skipped.

    Args:
        wxr: Extraction context used for parsing and text cleaning.
        word_entry: Entry whose ``forms`` list receives the new forms.
        t_node: The inflection template to expand.
        section_tags: Tags set as ``tags`` on every extracted form.
    """
    # https://cs.wiktionary.org/wiki/Šablona:Substantivum_(cs)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Return value unused; presumably called so that data found in the
    # expansion (e.g. categories) is recorded on word_entry - TODO confirm.
    clean_node(wxr, word_entry, expanded_node)
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        row_headers = []
        table_caption = ""
        for caption_node in table.find_child(NodeKind.TABLE_CAPTION):
            table_caption = clean_node(wxr, None, caption_node.children)
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            # A row without any data cell holds only column headers.
            is_column_header = not row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            # Start past the columns still occupied by multi-row headers
            # that began in an earlier row.
            for header in col_headers if is_column_header else row_headers:
                if (
                    header.rowspan > 1
                    and header.row_index <= row_index
                    and header.row_index + header.rowspan > row_index
                    and header.col_index <= col_index
                ):
                    col_index += header.colspan
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                colspan = _table_span(cell, "colspan")
                rowspan = _table_span(cell, "rowspan")
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    headers = col_headers if is_column_header else row_headers
                    headers.append(
                        TableHeader(
                            cell_text,
                            colspan,
                            rowspan,
                            col_index,
                            row_index,
                        )
                    )
                else:
                    # Tags shared by every form of this cell: the caption,
                    # then overlapping row headers, then column headers.
                    header_tags = []
                    if table_caption != "":
                        header_tags.append(table_caption)
                    header_tags.extend(
                        h.text
                        for h in row_headers
                        if h.text != ""
                        and h.row_index < row_index + rowspan
                        and h.row_index + h.rowspan > row_index
                    )
                    header_tags.extend(
                        h.text
                        for h in col_headers
                        if h.text != ""
                        and h.col_index < col_index + colspan
                        and h.col_index + h.colspan > col_index
                    )
                    # Exactly one "/" separates two alternative forms; any
                    # other count leaves the cell text as a single form.
                    if cell_text.count("/") == 1:
                        words = filter(
                            None, map(str.strip, cell_text.split("/"))
                        )
                    else:
                        words = [cell_text]
                    for word in words:
                        cell_tags, word = capture_text_in_parentheses(word)
                        word = word.strip()
                        if word in ("", "—", wxr.wtp.title):
                            continue
                        form = Form(
                            form=word, tags=section_tags, raw_tags=cell_tags
                        )
                        form.raw_tags.extend(header_tags)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += colspan
def extract_sloveso_ja_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    section_tags: list[str],
):
    """Collect forms from the tables of an expanded "Sloveso (ja)" template.

    Only the first three cells of each row are examined: a header cell
    supplies the row's raw tag, the data cell at index 1 supplies the form
    text and the data cell at index 2 supplies its romanization. Forms that
    are empty or equal to the page title are skipped.

    Args:
        wxr: Extraction context used for parsing and text cleaning.
        word_entry: Entry whose ``forms`` list is extended with the result.
        t_node: The template to expand.
        section_tags: Tags set as ``tags`` on every extracted form, the same
            way ``extract_substantivum_template`` uses them.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    forms = []
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            # Form created from this row, if any. The romanization must only
            # be attached to it, never to a form left over from an earlier
            # row when this row's word was skipped.
            row_form = None
            for col_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL | NodeKind.TABLE_HEADER_CELL)
            ):
                if col_index >= 3:
                    break
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header = clean_node(wxr, None, cell)
                elif cell.kind == NodeKind.TABLE_CELL:
                    if col_index == 1:
                        word = clean_node(wxr, None, cell)
                        if word not in ("", wxr.wtp.title):
                            row_form = Form(form=word, tags=section_tags)
                            if row_header != "":
                                row_form.raw_tags.append(row_header)
                            translate_raw_tags(row_form)
                            forms.append(row_form)
                    elif col_index == 2 and row_form is not None:
                        row_form.roman = clean_node(wxr, None, cell)

    word_entry.forms.extend(forms)