Coverage for src/wiktextract/extractor/cs/declension.py: 95%

1from dataclasses import dataclass

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from ..share import capture_text_in_parentheses

8from .models import Form, WordEntry

9from .tags import translate_raw_tags

# Declension section titles that carry a grammatical gender, mapped to the
# tags that are attached to every form extracted from such a section.
# Section titles that are not listed here contribute no extra tags.
DECLENSION_SECTION_TAGS: dict[str, list[str]] = {
    "skloňování mužské": ["masculine"],
    "skloňování ženské": ["feminine"],
}

def extract_declension_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    section_title: str,
):
    """Dispatch every template in a declension section to its extractor.

    ``section_title`` is looked up in ``DECLENSION_SECTION_TAGS``; the
    resulting tags (an empty list for unknown titles) are forwarded to the
    template extractors. Templates whose name matches neither rule below
    are ignored.
    """
    table_template_prefixes = (
        "Substantivum ",
        "Adjektivum ",
        "Stupňování ",
        "Sloveso ",
    )
    gender_tags = DECLENSION_SECTION_TAGS.get(section_title, [])
    for template in level_node.find_child(NodeKind.TEMPLATE):
        # The exact "Sloveso (ja)" name must be tested first: it also
        # starts with the generic "Sloveso " prefix.
        if template.template_name == "Sloveso (ja)":
            extract_sloveso_ja_template(
                wxr, word_entry, template, gender_tags
            )
            continue
        if template.template_name.startswith(table_template_prefixes):
            extract_substantivum_template(
                wxr, word_entry, template, gender_tags
            )

@dataclass
class TableHeader:
    """A header cell of a declension table together with its position."""

    # Cleaned text of the header cell.
    text: str
    # Number of columns the cell spans.
    colspan: int
    # Number of rows the cell spans.
    rowspan: int
    # Column position at which the cell starts.
    col_index: int
    # Index of the table row in which the cell appears.
    row_index: int

def _span_attr(cell, name: str) -> int:
    """Return a cell's ``colspan``/``rowspan`` attribute as an integer.

    A missing attribute, or one whose value ``int()`` cannot parse, counts
    as a span of 1 so that a single malformed cell does not abort the
    extraction of the whole table.
    """
    try:
        return int(cell.attrs.get(name, "1"))
    except ValueError:
        return 1


def extract_substantivum_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    section_tags: list[str],
):
    """Extract forms from the table(s) produced by a declension template.

    The template is expanded and every table in the result is walked row by
    row. Header cells are remembered with their position and span; each
    data cell becomes one ``Form`` (or two, when the cell text contains
    exactly one "/") that is appended to ``word_entry.forms``.

    Every form receives ``section_tags`` as tags and, as raw tags: the
    parenthesised text captured from the cell, the table caption (if any)
    and the text of every row/column header overlapping the cell. Cells
    that are empty, "—" or equal to the page title are skipped.
    """
    # https://cs.wiktionary.org/wiki/Šablona:Substantivum_(cs)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # NOTE(review): the return value is unused; presumably this call is
    # made so clean_node can record page data on word_entry - confirm.
    clean_node(wxr, word_entry, expanded_node)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        row_headers = []
        table_caption = ""
        for caption_node in table.find_child(NodeKind.TABLE_CAPTION):
            table_caption = clean_node(wxr, None, caption_node.children)
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            # A row without any data cell consists of column headers only.
            is_column_header = not row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            # Headers from earlier rows whose rowspan reaches into this row
            # occupy leading columns, so the first cell starts after them.
            for header in col_headers if is_column_header else row_headers:
                if (
                    header.rowspan > 1
                    and header.row_index <= row_index
                    and header.row_index + header.rowspan > row_index
                    and header.col_index <= col_index
                ):
                    col_index += header.colspan
            for cell in row.find_child(cell_kinds):
                cell_text = clean_node(wxr, None, cell)
                colspan = _span_attr(cell, "colspan")
                rowspan = _span_attr(cell, "rowspan")
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header = TableHeader(
                        cell_text, colspan, rowspan, col_index, row_index
                    )
                    if is_column_header:
                        col_headers.append(header)
                    else:
                        row_headers.append(header)
                else:
                    # Exactly one "/" separates two alternative forms; any
                    # other cell text is kept as a single form.
                    words = (
                        filter(None, map(str.strip, cell_text.split("/")))
                        if cell_text.count("/") == 1
                        else [cell_text]
                    )
                    for word in words:
                        cell_tags, word = capture_text_in_parentheses(word)
                        word = word.strip()
                        if word in ["", "—", wxr.wtp.title]:
                            continue
                        # Copy section_tags so the caller's list (possibly a
                        # module-level constant) is never shared with a
                        # form, whether or not Form copies its inputs.
                        form = Form(
                            form=word,
                            tags=list(section_tags),
                            raw_tags=cell_tags,
                        )
                        if table_caption != "":
                            form.raw_tags.append(table_caption)
                        # Row headers whose row range overlaps this cell.
                        for row_header in row_headers:
                            if (
                                row_header.text != ""
                                and row_header.row_index < row_index + rowspan
                                and row_header.row_index + row_header.rowspan
                                > row_index
                            ):
                                form.raw_tags.append(row_header.text)
                        # Column headers whose column range overlaps it.
                        for col_header in col_headers:
                            if (
                                col_header.text != ""
                                and col_header.col_index < col_index + colspan
                                and col_header.col_index + col_header.colspan
                                > col_index
                            ):
                                form.raw_tags.append(col_header.text)
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += colspan

132

133

def extract_sloveso_ja_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    section_tags: list[str],
):
    """Extract forms from the table produced by a ``Sloveso (ja)`` template.

    The template is expanded and only the first three cells of each table
    row are read: a header cell supplies the row label, the data cell at
    index 1 the word form and the data cell at index 2 its romanization.
    Word forms that are empty or equal to the page title are skipped. The
    collected forms are appended to ``word_entry.forms``.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cell_kinds = NodeKind.TABLE_CELL | NodeKind.TABLE_HEADER_CELL
    forms = []
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            # Form created from this row, if any. Tracking it per row keeps
            # the romanization of a skipped word from overwriting the
            # ``roman`` of a form that came from an earlier row.
            row_form = None
            for col_index, cell in enumerate(row.find_child(cell_kinds)):
                if col_index >= 3:
                    break
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header = clean_node(wxr, None, cell)
                elif cell.kind == NodeKind.TABLE_CELL:
                    if col_index == 1:
                        word = clean_node(wxr, None, cell)
                        if word not in ["", wxr.wtp.title]:
                            # section_tags are passed as ``tags`` (copied),
                            # matching extract_substantivum_template; only
                            # the row label is a raw tag to be translated.
                            row_form = Form(
                                form=word,
                                tags=list(section_tags),
                                raw_tags=(
                                    [row_header] if row_header != "" else []
                                ),
                            )
                            translate_raw_tags(row_form)
                            forms.append(row_form)
                    elif col_index == 2 and row_form is not None:
                        row_form.roman = clean_node(wxr, None, cell)

    word_entry.forms.extend(forms)

Coverage for src/wiktextract/extractor/cs/declension.py: 95%

87 statements