Coverage for src/wiktextract/extractor/cs/declension.py: 98%
61 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1from dataclasses import dataclass
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import capture_text_in_parentheses
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
# Tags implied by the title of a declension section; looked up by
# extract_declension_section() and applied to every form found in it.
# Titles that are not listed here contribute no tags.
DECLENSION_SECTION_TAGS: dict[str, list[str]] = {
    "skloňování mužské": ["masculine"],
    "skloňování ženské": ["feminine"],
}
def extract_declension_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    section_title: str,
):
    """Extract forms from the inflection templates of one section.

    Every template that is a direct child of ``level_node`` and whose name
    starts with one of the known inflection-table prefixes is handed to
    ``extract_substantivum_template``.  Tags registered for
    ``section_title`` in ``DECLENSION_SECTION_TAGS`` (none when the title
    is unknown) are passed along for the extracted forms.
    """
    # The trailing space is part of each prefix.
    table_template_prefixes = (
        "Substantivum ",
        "Adjektivum ",
        "Stupňování ",
        "Sloveso ",
    )
    tags_for_section = DECLENSION_SECTION_TAGS.get(section_title, [])
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if not template.template_name.startswith(table_template_prefixes):
            continue
        extract_substantivum_template(
            wxr, word_entry, template, tags_for_section
        )
@dataclass
class TableHeader:
    """A header cell of an inflection table together with its position.

    The position and span are used to decide which data cells the header
    text applies to.
    """

    # Cleaned text of the header cell.
    text: str
    # Number of columns the cell spans.
    colspan: int
    # Number of rows the cell spans.
    rowspan: int
    # Column position at which the cell starts.
    col_index: int
    # Index of the table row that contains the cell.
    row_index: int
def _parse_span(cell, attr_name: str) -> int:
    """Return the ``colspan``/``rowspan`` attribute of ``cell`` as an int.

    Wiki markup is user-edited, so the attribute may be missing, malformed
    or non-positive; all of those cases fall back to a span of 1 instead of
    raising or producing a header that covers nothing.
    """
    try:
        span = int(cell.attrs.get(attr_name, "1"))
    except ValueError:
        return 1
    return max(span, 1)


def _covers_row(header: TableHeader, row_index: int) -> bool:
    """Tell whether ``header`` vertically spans the row at ``row_index``."""
    return header.row_index <= row_index < header.row_index + header.rowspan


def _covers_column(header: TableHeader, col_index: int) -> bool:
    """Tell whether ``header`` horizontally spans column ``col_index``."""
    return header.col_index <= col_index < header.col_index + header.colspan


def extract_substantivum_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    section_tags: list[str],
):
    """Extract inflected forms from an expanded inflection-table template.

    The template is expanded and every table in the result is walked row by
    row.  Header cells are remembered with their position and span; every
    data cell is split on ``" / "`` into individual forms, and each form is
    appended to ``word_entry.forms``.

    Each form receives a copy of ``section_tags`` as tags, and as raw tags:
    the parenthesised text captured from the form itself, the table caption
    (if any), the text of every row header spanning the current row and the
    text of every column header spanning the current column.  Empty forms,
    the "—" placeholder and forms equal to the page title are skipped.

    Args:
        wxr: Extraction context; used to expand wikitext and for the
            current page title.
        word_entry: Entry that receives the extracted forms.
        t_node: The inflection-table template node.
        section_tags: Tags applied to every extracted form.
    """
    # https://cs.wiktionary.org/wiki/Šablona:Substantivum_(cs)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # The returned text is discarded; the call is made only for whatever
    # clean_node records on word_entry.
    # NOTE(review): presumably category links - confirm.
    clean_node(wxr, word_entry, expanded_node)
    skipped_forms = ("", "—", wxr.wtp.title)
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers: list[TableHeader] = []
        row_headers: list[TableHeader] = []
        table_caption = ""
        for caption_node in table.find_child(NodeKind.TABLE_CAPTION):
            table_caption = clean_node(wxr, None, caption_node.children)
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            # A row without any data cell is a row of column headers.
            is_column_header = not row.contain_node(NodeKind.TABLE_CELL)
            # Headers from earlier rows that span into this row via rowspan
            # occupy leading columns, so start counting after them.
            spanning_headers = col_headers if is_column_header else row_headers
            col_index = sum(
                header.colspan
                for header in spanning_headers
                if header.rowspan > 1 and _covers_row(header, row_index)
            )
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                colspan = _parse_span(cell, "colspan")
                rowspan = _parse_span(cell, "rowspan")
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header = TableHeader(
                        text=cell_text,
                        colspan=colspan,
                        rowspan=rowspan,
                        col_index=col_index,
                        row_index=row_index,
                    )
                    if is_column_header:
                        col_headers.append(header)
                    else:
                        row_headers.append(header)
                else:
                    for word in cell_text.split(" / "):
                        cell_tags, word = capture_text_in_parentheses(word)
                        word = word.strip()
                        if word in skipped_forms:
                            continue
                        # Copy section_tags: it may be one of the lists in
                        # DECLENSION_SECTION_TAGS and must not be aliased by
                        # a Form whose tags are modified afterwards.
                        form = Form(
                            form=word,
                            tags=list(section_tags),
                            raw_tags=cell_tags,
                        )
                        if table_caption != "":
                            form.raw_tags.append(table_caption)
                        form.raw_tags.extend(
                            row_header.text
                            for row_header in row_headers
                            if row_header.text != ""
                            and _covers_row(row_header, row_index)
                        )
                        form.raw_tags.extend(
                            col_header.text
                            for col_header in col_headers
                            if col_header.text != ""
                            and _covers_column(col_header, col_index)
                        )
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += colspan