Coverage for src/wiktextract/extractor/simple/table.py: 62%

1from itertools import chain

3from wikitextprocessor import NodeKind, TemplateNode, WikiNode

5# from wikitextprocessor.parser import print_tree

6from wiktextract.page import clean_node

7from wiktextract.wxr_context import WiktextractContext

8from wiktextract.wxr_logging import logger

10from .models import Form, WordEntry

11from .simple_tags import simple_tag_map

12from .tags_utils import convert_tags

# Shorthand for this file. Could be an import, but it's so simple...
# Used in this file's annotations for an item that is either a plain string
# or a WikiNode.
Node = str | WikiNode

19# node_fns are different from template_fns. template_fns are functions that

20# are used to handle how to expand (and otherwise process) templates, while

21# node functions are used when turning parsed nodes into strings.

def cell_node_fn(
    node: WikiNode,
) -> list[Node] | None:
    """Node handler used while cleaning inflection table cells.

    Two kinds of nodes get special treatment:

    * Table cells and table header cells are replaced by their children.
      This mirrors the default node handler and only matters if someone puts
      tables inside tables.
    * An italic node whose single child is the string 'none' (surrounding
      whitespace ignored), like in `deviate`, is replaced with an en dash.

    Returns the replacement list of nodes, or None when the node is not
    handled here.
    """
    assert isinstance(node, WikiNode)

    node_kind = node.kind
    if node_kind in (NodeKind.TABLE_CELL, NodeKind.TABLE_HEADER_CELL):
        return node.children
    if node_kind != NodeKind.ITALIC:
        return None

    # XXX 'None' without italics...
    kids = node.children
    is_lone_none = (
        len(kids) == 1
        and isinstance(kids[0], str)
        and kids[0].strip() == "none"
    )
    if is_lone_none:
        return ["–"]
    return None

def parse_pos_table(
    wxr: WiktextractContext, tnode: TemplateNode, data: WordEntry
) -> list[Form]:
    """Parse the inflection table generated by a POS template.

    Simple English Wiktionary article POS sections start with a template that
    generates a table with the different inflected forms. The template is
    expanded (together with all subtemplates), re-parsed, and then walked
    row by row; each non-empty cell yields at most one `Form`.

    Args:
        wxr: Context giving access to the wikitext processor (`wxr.wtp`) and
            the configuration (`wxr.config`).
        tnode: The template node that produces the table.
        data: Word entry passed through to `clean_node()`.

    Returns:
        The forms collected from the table. Header texts are first stored as
        `raw_tags`; when `convert_tags()` recognizes any of them, the form's
        `tags` and `raw_tags` are replaced with its results.
    """
    assert isinstance(tnode, TemplateNode)
    # Expand the template into text (and all subtemplates too), then parse.
    tree = wxr.wtp.parse(wxr.wtp.node_to_wikitext(tnode), expand_all=True)

    # Some debugging code: if wiktwords is passed a --inflection-tables-file
    # argument, we save tables to a file for debugging purposes, or for just
    # getting tables that can be used as test data.
    if wxr.config.expand_tables:
        # Wiki text is routinely non-ASCII, so do not depend on the locale's
        # default encoding when writing the dump.
        with open(wxr.config.expand_tables, "w", encoding="utf-8") as f:
            f.write(f"{wxr.wtp.title=}\n")
            text = wxr.wtp.node_to_wikitext(tree)
            f.write(f"{text}\n")

    # Check if there are actually any headers, because Simple English
    # Wiktionary doesn't use them in these POS template tables.
    # Headers and non-headers in other editions can be a real headache.
    # Having headers is better than not, but when they're inconsistently
    # applied, it's a headache.
    for header in tree.find_child_recursively(NodeKind.TABLE_HEADER_CELL):
        wxr.wtp.debug(
            f"POS template table has headers! {repr(header)[:255]}",
            sortid="simple/table/45",
        )

    # A typical SEW table has simple 2-line cells, without headers, EXCEPT
    # some have actual table structure like "did". That's why we do things
    # row-by-row.
    # Column headers are keyed by cell index and carried over between rows.
    column_hdrs: dict[int, str] = {}
    forms: list[Form] = []
    for row in chain(
        # This just combines these two (mostly mutually exclusive) calls
        # into one iterable, with an expectation that we get only WikiNode
        # rows or only HTML rows. If they're mixed up, that's super weird.
        # It's a hack!
        tree.find_child_recursively(NodeKind.TABLE_ROW),
        tree.find_html_recursively("tr"),
    ):
        # The row's active header, if any (set by the first cell of the row).
        row_hdr = ""
        for i, cell in chain(
            row.find_child(NodeKind.TABLE_CELL, with_index=True),
            row.find_html("td", with_index=True, attr_name="", attr_value=""),
        ):
            text = clean_node(
                wxr, data, cell, node_handler_fn=cell_node_fn
            ).strip()
            if not text:
                # In case there's an empty cell on the first row.
                if i not in column_hdrs:
                    column_hdrs[i] = ""
                continue

            lines = [s.strip() for s in text.splitlines()]
            if len(lines) != 2:
                # SEW style is a single cell whose first line is the
                # 'header' and whose second line is the form/data; anything
                # else is worth a debug note.
                logger.debug(
                    f"{wxr.wtp.title}: A cell that's "
                    f"not exactly 2 lines: {repr(text)}"
                )

            if len(lines) == 1:
                # XXX do tag parsing instead of i == 0; Levenshtein.
                if text in simple_tag_map:
                    # Found something that looks like a tag.
                    if i == 0:
                        row_hdr = text
                    column_hdrs[i] = text
                else:
                    tags = []
                    col_hdr = column_hdrs.get(i)
                    if col_hdr:
                        tags.append(col_hdr)
                    if row_hdr:
                        tags.append(row_hdr)
                    forms.append(Form(form=text, raw_tags=tags))
                    # Add a single line cell as a column header and trust it
                    # will be overridden as appropriate.
                    # Only applicable to Simple English Wiktionary!
                    column_hdrs[i] = text
                continue

            if len(lines) == 2:
                # Default assumption: header line followed by the form.
                column_hdrs[i] = lines[0]
                cell_content = lines[1]
                tags = []
                if column_hdrs[i]:
                    tags.append(column_hdrs[i])
                if row_hdr:
                    tags.append(row_hdr)
                forms.append(Form(form=cell_content, raw_tags=tags))
            # Ignore cells with more than two lines.

    # Replace raw_tags with tags if appropriate.
    for form in forms:
        # XXX The third value ("poses") holds strings like "adj 1", used in
        # pronunciation data to later associate sound data with the correct
        # pos entry. Not useful or common here? Warning about it spams the
        # logs, so it is ignored.
        legit_tags, new_raw_tags, _poses = convert_tags(form.raw_tags)
        if legit_tags:
            form.tags = legit_tags
            form.raw_tags = new_raw_tags

    return forms