Coverage for src / wiktextract / extractor / simple / table.py: 62%

69 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-02 00:27 +0000

1from itertools import chain 

2 

3from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

4 

5# from wikitextprocessor.parser import print_tree 

6from wiktextract.page import clean_node 

7from wiktextract.wxr_context import WiktextractContext 

8from wiktextract.wxr_logging import logger 

9 

10from .models import Form, WordEntry 

11from .simple_tags import simple_tag_map 

12from .tags_utils import convert_tags 

13 

# Shorthand type alias for this file: an item of a parse-tree child list as
# handled here, i.e. either plain text (`str`) or a parsed `WikiNode`.
# Could be an import, but it's so simple...
Node = str | WikiNode

16 

17 

18# node_fns are different from template_fns. template_fns are functions that 

19# are used to handle how to expand (and otherwise process) templates, while 

20# node functions are used when turning parsed nodes into strings. 

def cell_node_fn(node: WikiNode) -> list[Node] | None:
    """Node handler used while turning table cells into text.

    Two cases get special treatment:

    * An italic node whose only child is the string 'none' (ignoring
      surrounding whitespace), as seen in `deviate`, is replaced by a
      dash ("–").
    * A table cell or table header cell is replaced by its children, so
      that nested tables are flattened into their content.

    Returns the replacement list of nodes, or None when the node should be
    left to the caller's default handling.
    """
    assert isinstance(node, WikiNode)
    kind = node.kind

    if kind == NodeKind.ITALIC:
        # XXX 'None' without italics is not handled.
        children = node.children
        if len(children) == 1:
            only_child = children[0]
            if isinstance(only_child, str) and only_child.strip() == "none":
                return ["–"]
        # Any other italic node gets no special treatment.
        return None

    # Mirrors the default node handler; really only matters if someone puts
    # tables inside tables.
    if kind == NodeKind.TABLE_CELL or kind == NodeKind.TABLE_HEADER_CELL:
        return node.children

    return None

45 

46 

def _header_tags(column_hdr: str, row_hdr: str) -> list[str]:
    """Return the non-empty headers for a cell, column header first."""
    return [hdr for hdr in (column_hdr, row_hdr) if hdr]


def parse_pos_table(
    wxr: WiktextractContext, tnode: TemplateNode, data: WordEntry
) -> list[Form]:
    """Parse the inflection table generated by a POS-section template.

    Simple English Wiktionary article POS sections start with a template
    that generates a table with the different inflected forms. The template
    is expanded and re-parsed, then every table row (wikitext rows and HTML
    `<tr>` rows alike) is walked cell by cell.

    Args:
        wxr: Context providing the wikitext processor (`wxr.wtp`) and the
            configuration (`wxr.config`).
        tnode: The template node that generates the table.
        data: Word entry passed through to `clean_node()`.

    Returns:
        One `Form` per data cell found. Header text is stored in `raw_tags`;
        when `convert_tags()` recognises any tags, `tags` is set and
        `raw_tags` is replaced with the leftovers.
    """
    assert isinstance(tnode, TemplateNode)
    # Expand the template into text (and all subtemplates too), then parse.
    tree = wxr.wtp.parse(wxr.wtp.node_to_wikitext(tnode), expand_all=True)

    # Some debugging code: if wiktwords is passed a --inflection-tables-file
    # argument, we save tables to a file for debugging purposes, or for just
    # getting tables that can be used as test data.
    # The file is opened in "w" mode, so each call overwrites the previous
    # dump. The encoding is pinned to UTF-8 because the expanded wikitext is
    # routinely non-ASCII and the locale default encoding may not be able to
    # represent it.
    if wxr.config.expand_tables:
        with open(wxr.config.expand_tables, "w", encoding="utf-8") as f:
            f.write(f"{wxr.wtp.title=}\n")
            text = wxr.wtp.node_to_wikitext(tree)
            f.write(f"{text}\n")

    # Check if there are actually any headers, because Simple English
    # Wiktionary doesn't use them in these POS template tables.
    # Headers and non-headers in other editions can be a real headache.
    # Having headers is better than not, but when they're inconsistently
    # applied, it's a headache.
    for header in tree.find_child_recursively(NodeKind.TABLE_HEADER_CELL):
        wxr.wtp.debug(
            f"POS template table has headers! {repr(header)[:255]}",
            sortid="simple/table/45",
        )

    # A typical SEW table has simple 2-line cells, without headers, EXCEPT
    # some have actual table structure like "did". That's why we do things
    # row-by-row.
    column_hdrs: dict[int, str] = {}
    forms: list[Form] = []
    # This just combines these two (mostly mutually exclusive) calls into
    # one iterable, with an expectation that we get only WikiNode rows or
    # only HTML rows. If they're mixed up, that's super weird. It's a hack!
    rows = chain(
        tree.find_child_recursively(NodeKind.TABLE_ROW),
        tree.find_html_recursively("tr"),
    )
    for row in rows:
        # The row's active header, if any (set while going left to right).
        row_hdr = ""
        cells = chain(
            row.find_child(NodeKind.TABLE_CELL, with_index=True),
            row.find_html("td", with_index=True, attr_name="", attr_value=""),
        )
        for i, cell in cells:
            text = clean_node(
                wxr, data, cell, node_handler_fn=cell_node_fn
            ).strip()
            if not text:
                # In case there's an empty cell on the first row: remember
                # the column exists without clobbering a known header.
                column_hdrs.setdefault(i, "")
                continue

            lines = [s.strip() for s in text.splitlines()]
            if len(lines) != 2:
                # SEW style is a single cell whose first line is the
                # 'header' and whose second line is the form/data; anything
                # else is worth a debug note.
                logger.debug(
                    f"{wxr.wtp.title}: A cell that's "
                    f"not exactly 2 lines: {repr(text)}"
                )

            if len(lines) == 1:
                # XXX do tag parsing instead of i == 0; Levenshtein.
                if text in simple_tag_map:
                    # Found something that looks like a tag: it's a header.
                    if i == 0:
                        row_hdr = text
                    column_hdrs[i] = text
                else:
                    # Tags must be built from the *previous* column header,
                    # before it is overwritten below.
                    tags = _header_tags(column_hdrs.get(i, ""), row_hdr)
                    forms.append(Form(form=text, raw_tags=tags))
                    # Add a single line cell as a column header and trust
                    # it will be overridden as appropriate.
                    # Only applicable to Simple English Wiktionary!
                    column_hdrs[i] = text
                continue

            if len(lines) == 2:
                # Default assumption: header line followed by form line.
                column_hdrs[i] = lines[0]
                cell_content = lines[1]
                tags = _header_tags(column_hdrs[i], row_hdr)
                forms.append(Form(form=cell_content, raw_tags=tags))
            # Cells with more than two lines are ignored.

    # For debugging, the parse tree can be dumped with print_tree() from
    # wikitextprocessor.parser.

    # Replace raw_tags with tags if appropriate.
    for form in forms:
        # XXX the third value ("poses") holds strings like "adj 1", used in
        # pronunciation data to later associate sound data with the correct
        # pos entry. Not useful or common here? Warning about it spams the
        # logs, so it is deliberately ignored.
        legit_tags, new_raw_tags, _poses = convert_tags(form.raw_tags)
        if legit_tags:
            form.tags = legit_tags
            form.raw_tags = new_raw_tags

    return forms

161 

162 return forms