Coverage for src/wiktextract/extractor/simple/table.py: 62%

69 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from itertools import chain 

2 

3from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

4 

5# from wikitextprocessor.parser import print_tree 

6from wiktextract.page import clean_node 

7from wiktextract.wxr_context import WiktextractContext 

8from wiktextract.wxr_logging import logger 

9 

10from .models import Form, WordEntry 

11from .simple_tags import simple_tag_map 

12from .tags_utils import convert_tags 

13 

# File-local alias for "plain text or parsed node". Kept here rather than
# imported because the definition is a one-liner.
Node = str | WikiNode

16 

17 

18 

19# node_fns are different from template_fns. template_fns are functions that 

20# are used to handle how to expand (and otherwise process) templates, while 

21# node functions are used when turning parsed nodes into strings. 

def cell_node_fn(
    node: WikiNode,
) -> list[Node] | None:
    """Node handler used while turning table-cell parse nodes into text.

    Two node kinds get special treatment:

    * An italic node whose sole child is the string 'none' (ignoring
      surrounding whitespace) is replaced with a dash, as seen in the
      table for `deviate`.
    * Table cells and table header cells are replaced by their children,
      which only matters if someone nests a table inside a table.

    Returns the replacement list of nodes, or None when this handler has
    nothing special to do for `node` (presumably the caller then falls
    back to its default handling -- verify against clean_node).
    """
    assert isinstance(node, WikiNode)

    # XXX 'None' without italics is not handled here.
    if node.kind == NodeKind.ITALIC and len(node.children) == 1:
        only_child = node.children[0]
        if isinstance(only_child, str) and only_child.strip() == "none":
            return ["–"]

    # Mirrors what the default node handler does for cells; really
    # unnecessary, but harmless in case tables end up inside tables.
    if node.kind in (NodeKind.TABLE_CELL, NodeKind.TABLE_HEADER_CELL):
        return node.children

    return None

46 

47 

def parse_pos_table(
    wxr: WiktextractContext, tnode: TemplateNode, data: WordEntry
) -> list[Form]:
    """Parse inflection table. Simple English Wiktionary article POS sections
    start with a template that generates a table with the different inflected
    forms.

    Args:
        wxr: Extraction context; its `wtp` is used to expand and parse the
            template, and `config.expand_tables` optionally names a debug
            dump file.
        tnode: The template node that generates the table.
        data: The word entry being built; only passed through to
            `clean_node` when cell contents are converted to text.

    Returns:
        One `Form` per table cell that was recognized as holding a form.
        Each form's `raw_tags` are collected from the cell's column header
        and the row header (if any). When `convert_tags` recognizes any of
        them, the recognized ones are stored in `tags` and `raw_tags` is
        replaced with the leftovers.
    """
    assert isinstance(tnode, TemplateNode)
    # Expand the template into text (and all subtemplates too), then parse.
    tree = wxr.wtp.parse(wxr.wtp.node_to_wikitext(tnode), expand_all=True)

    # Some debugging code: if wiktwords is passed a --inflection-tables-file
    # argument, we save tables to a file for debugging purposes, or for just
    # getting tables that can be used as test data.
    if wxr.config.expand_tables:
        # Titles and table text are frequently non-ASCII, so the encoding is
        # pinned instead of relying on the platform's locale default.
        # NOTE(review): mode "w" truncates the file on every call, so only
        # the most recently parsed table survives -- confirm this is
        # intended.
        with open(wxr.config.expand_tables, "w", encoding="utf-8") as f:
            f.write(f"{wxr.wtp.title=}\n")
            text = wxr.wtp.node_to_wikitext(tree)
            f.write(f"{text}\n")

    # Check if there are actually any headers, because Simple English Wiktionary
    # doesn't use them in these POS template tables.
    # Headers and non-headers in other editions can be a real headache.
    # Having headers is better than not, but when they're inconsistently
    # applied, it's a headache.
    for header in tree.find_child_recursively(NodeKind.TABLE_HEADER_CELL):
        wxr.wtp.debug(
            f"POS template table has headers! {repr(header)[:255]}",
            sortid="simple/table/45",
        )

    # A typical SEW table has simple 2-line cells, without headers, EXCEPT
    # some have actual table structure like "did". That's why we do things
    # row-by-row.
    column_hdrs: dict[int, str] = {}
    forms: list[Form] = []
    for row in chain(
        # This just combines these two (mostly mutually exclusive) calls
        # into one sequence, with an expectation that we get only wikitext
        # table rows or only HTML rows. If they're mixed up, that's super
        # weird. It's a hack!
        tree.find_child_recursively(NodeKind.TABLE_ROW),
        tree.find_html_recursively("tr"),
    ):
        # The active row header, if this row has one (left to right).
        row_hdr = ""
        for i, cell in chain(
            row.find_child(NodeKind.TABLE_CELL, with_index=True),
            row.find_html("td", with_index=True, attr_name="", attr_value=""),
        ):
            text = clean_node(
                wxr, data, cell, node_handler_fn=cell_node_fn
            ).strip()
            if not text:
                # In case there's an empty cell on the first row.
                if i not in column_hdrs:
                    column_hdrs[i] = ""
                continue
            lines = [s.strip() for s in text.splitlines()]
            if len(lines) != 2:
                # The expected SEW style is a single cell whose first line
                # is the 'header' and whose second line is the form/data;
                # anything else is worth a debug message.
                logger.debug(
                    f"{wxr.wtp.title}: A cell that's "
                    f"not exactly 2 lines: {repr(text)}"
                )
            if len(lines) == 1:
                # XXX do tag parsing instead of i == 0; Levenshtein.
                if text in simple_tag_map:
                    # Found something that looks like a tag.
                    if i == 0:
                        row_hdr = text
                    column_hdrs[i] = text
                else:
                    tags = []
                    if i in column_hdrs and column_hdrs[i]:
                        tags.append(column_hdrs[i])
                    if row_hdr:
                        tags.append(row_hdr)
                    forms.append(Form(form=text, raw_tags=tags))
                    # Add a single line cell as a column header and trust it
                    # will be overridden as appropriate
                    # Only applicable to Simple English wiktionary!
                    column_hdrs[i] = text

                continue
            if len(lines) == 2:
                # Default assumption: header line followed by the form.
                column_hdrs[i] = lines[0]
                cell_content = lines[1]
                tags = []
                if column_hdrs[i]:
                    tags.append(column_hdrs[i])
                if row_hdr:
                    tags.append(row_hdr)
                forms.append(Form(form=cell_content, raw_tags=tags))
            # Ignore cells with more than two lines.

    # logger.debug(
    #     f"{wxr.wtp.title}\n{print_tree(tree, indent=2, ret_value=True)}"
    # )
    # print(forms)

    # Replace raw_tags with tags if appropriate
    for form in forms:
        legit_tags, new_raw_tags, _poses = convert_tags(form.raw_tags)
        # XXX poses are strings like "adj 1", used in pronunciation data
        # to later associate sound data with the correct pos entry.
        # Not useful or common here?
        # if len(poses) > 0:  # This spams the logs
        #     wxr.wtp.warning(f"convert_tags() returned weird `poses` data for "
        #                     f"forms: {poses=}", sortid="simple/table/122")
        if legit_tags:
            form.tags = legit_tags
            form.raw_tags = new_raw_tags

    return forms