Coverage for src/wiktextract/extractor/de/inflection.py: 92%

127 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import NodeKind, TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .flexion import parse_flexion_page 

9from .models import Form, WordEntry 

10from .tags import translate_raw_tags 

11 

12 

def extract_inf_table_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Route an inflection overview template to the matching table parser.

    The template name is matched by suffix: "Substantiv Übersicht" goes to
    the noun parser, "Adjektiv Übersicht" to the adjective parser and
    "Verb Übersicht" to the verb parser. Any other template is ignored.
    Extracted forms are appended to ``word_entry.forms`` by the parsers.
    """
    template_name = template_node.template_name
    if template_name.endswith("Substantiv Übersicht"):
        process_noun_table(wxr, word_entry, template_node)
    elif template_name.endswith("Adjektiv Übersicht"):
        process_adj_table(wxr, word_entry, template_node)
    elif template_name.endswith("Verb Übersicht"):
        process_verb_table(wxr, word_entry, template_node)

24 

25 

@dataclass
class RowspanHeader:
    """A row header cell that may apply to several consecutive table rows.

    Instances are mutable on purpose: ``process_verb_table`` decrements
    ``span`` after each table row and drops the header once it is used up.
    """

    # cleaned text of the header cell; added to forms as a raw tag
    text: str
    # position of the header; every caller in this module passes 0
    index: int
    # number of table rows (including the current one) still covered
    span: int

31 

32 

def _parse_rowspan(raw_value: str) -> int:
    """Return the ``rowspan`` attribute as an int, or 1 if it is malformed."""
    try:
        return int(raw_value)
    except (TypeError, ValueError):
        return 1


def _append_verb_forms(
    word_entry: WordEntry,
    cell_text: str,
    person: str,
    col_index: int,
    col_headers: list,
    row_headers: list,
) -> None:
    """Create one ``Form`` per line of a data cell and per listed person."""
    for cell_line in cell_text.splitlines():
        cell_line = cell_line.strip()
        if cell_line == "":
            continue
        # the person cell may list several pronouns separated by commas;
        # each one yields its own form. "".split(",") is [""], so a row
        # without a person still produces exactly one form.
        for pronoun in person.split(","):
            pronoun = pronoun.strip()
            form_text = cell_line if pronoun == "" else pronoun + " " + cell_line
            form = Form(form=form_text)
            if col_index < len(col_headers):
                form.raw_tags.append(col_headers[col_index])
            form.raw_tags.extend(header.text for header in row_headers)
            translate_raw_tags(form)
            word_entry.forms.append(form)


def process_verb_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Extract verb forms from the expanded verb overview template.

    The template is expanded and re-parsed, and only the first table in the
    result is read. Header cells become column headers or row headers (row
    headers honour their ``rowspan``); data cells become ``Form`` objects
    appended to ``word_entry.forms``, tagged with the active headers. Cells
    whose text starts with "All other forms:" are treated as links to
    flexion pages, which are passed to ``parse_flexion_page``.

    Returns without doing anything when the expansion contains no table.
    """
    # Vorlage:Deutsch Verb Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_node = next(iter(expanded_template.find_child(NodeKind.TABLE)), None)
    if table_node is None:
        return

    col_headers = []
    row_headers = []
    has_person = False
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0  # counts data cells only
        header_col_index = 0  # counts non-empty header cells only
        person = ""
        for table_cell in table_row.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if cell_text.startswith("All other forms:"):
                for link_node in table_cell.find_child_recursively(
                    NodeKind.LINK
                ):
                    parse_flexion_page(
                        wxr, word_entry, clean_node(wxr, None, link_node)
                    )
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if cell_text == "":
                    continue
                if header_col_index == 0:
                    # first header cell of a row is a row header
                    rowspan = _parse_rowspan(
                        table_cell.attrs.get("rowspan", "1")
                    )
                    row_headers.append(RowspanHeader(cell_text, 0, rowspan))
                elif cell_text in ("Person", "Wortform"):
                    has_person = True
                else:  # new table
                    col_headers.append(cell_text)
                    has_person = False
                    person = ""
                header_col_index += 1
            elif table_cell.kind == NodeKind.TABLE_CELL:
                if has_person and col_index == 0:
                    if cell_text in ("Singular", "Plural"):
                        # number label applies to the current row only
                        row_headers.append(RowspanHeader(cell_text, 0, 1))
                    else:
                        person = cell_text
                else:
                    _append_verb_forms(
                        word_entry,
                        cell_text,
                        person,
                        col_index,
                        col_headers,
                        row_headers,
                    )
                col_index += 1

        # keep only the row headers that still span the following row
        remaining_headers = []
        for header in row_headers:
            if header.span > 1:
                header.span -= 1
                remaining_headers.append(header)
        row_headers = remaining_headers

108 

109 

def process_noun_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Extract noun forms from the expanded noun overview template.

    The template is expanded and re-parsed, and only the first table in the
    result is read. Rows without any data cell are header rows and supply
    the column headers; in other rows a header cell is the row header.
    Each non-placeholder line of a data cell becomes a ``Form`` appended to
    ``word_entry.forms``, tagged with its row and column header. Finally
    the whole expansion is cleaned against ``word_entry`` to collect
    category links.

    Returns without doing anything when the expansion contains no table.
    """
    # Vorlage:Deutsch Substantiv Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_node = next(iter(expanded_template.find_child(NodeKind.TABLE)), None)
    if table_node is None:
        return

    column_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        row_header = ""
        is_header_row = not table_row.contain_node(NodeKind.TABLE_CELL)
        for col_index, table_cell in enumerate(
            table_row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            )
        ):
            cell_text = clean_node(wxr, None, table_cell)
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                if is_header_row:
                    # strip a trailing number, e.g. "Singular 1" -> "Singular"
                    column_headers.append(re.sub(r"\s*\d+$", "", cell_text))
                else:
                    row_header = cell_text
                continue

            for form_text in cell_text.splitlines():
                # strip first so whitespace-only lines are caught by the
                # placeholder filter instead of becoming blank forms
                form_text = form_text.strip()
                if form_text in ("—", "", "?"):
                    continue
                form = Form(form=form_text)
                if row_header:
                    form.raw_tags.append(row_header)
                if col_index < len(column_headers):
                    form.raw_tags.append(column_headers[col_index])
                translate_raw_tags(form)
                word_entry.forms.append(form)

    clean_node(wxr, word_entry, expanded_template)  # category links

150 

151 

def process_adj_table(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Extract adjective forms from the expanded adjective overview template.

    The template is expanded and re-parsed, and only the first table in the
    result is read. Header cells are collected as column headers; each
    non-placeholder line of a data cell becomes a ``Form`` appended to
    ``word_entry.forms``, tagged with the column header at the cell's
    position. Cells whose text starts with "All other forms:" are treated
    as links to flexion pages, which are passed to ``parse_flexion_page``.

    Returns without doing anything when the expansion contains no table.
    """
    # Vorlage:Deutsch Adjektiv Übersicht
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    table_node = next(iter(expanded_template.find_child(NodeKind.TABLE)), None)
    if table_node is None:
        return

    column_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        for col_index, table_cell in enumerate(
            table_row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            )
        ):
            cell_text = clean_node(wxr, None, table_cell)
            # because {{int:}} magic word is not implemented
            # template "Textbaustein-Intl" expands to English words
            if cell_text.startswith("All other forms:"):
                # NOTE(review): only direct child links are followed here,
                # while process_verb_table searches recursively; confirm
                # the difference is intended.
                for link_node in table_cell.find_child(NodeKind.LINK):
                    parse_flexion_page(
                        wxr, word_entry, clean_node(wxr, None, link_node)
                    )
            elif table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                column_headers.append(cell_text)
            else:
                for form_text in cell_text.splitlines():
                    # strip first so whitespace-only lines are caught by
                    # the placeholder filter instead of becoming blank forms
                    form_text = form_text.strip()
                    if form_text in ("—", "", "?"):
                        continue
                    form = Form(form=form_text)
                    if col_index < len(column_headers):
                        form.raw_tags.append(column_headers[col_index])
                    translate_raw_tags(form)
                    word_entry.forms.append(form)