Coverage for src/wiktextract/extractor/th/linkage.py: 88%

93 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1from itertools import count 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .models import Linkage, WordEntry 

14from .section_titles import LINKAGE_SECTIONS 

15 

16 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    source: str = "",
    sense: str = "",
) -> None:
    """Dispatch the children of a linkage section to the matching handler.

    Handles "col*" templates, the "ws" template and plain wiki lists;
    any other child node is ignored.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            if child.template_name.startswith("col"):
                extract_col_template(
                    wxr, word_entry, child, linkage_type, source, sense
                )
            elif child.template_name == "ws":
                extract_ws_template(
                    wxr, word_entry, child, linkage_type, source, sense
                )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr, word_entry, list_item, linkage_type, source, sense
                )

41 

42 

def extract_col_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Extract linkage words from an expanded "col*" template.

    Each expanded ``<li>`` yields zero or more Linkage items; a "Latn" span
    provides a romanization for the words already collected in that item,
    and "Hant"/"Hans" spans add Chinese-script tags.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in expanded.find_html_recursively("li"):
        item_links: list[Linkage] = []
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            if "Latn" in span_class:
                # Romanization span: applies to every word gathered so far
                # in this list item.
                roman = clean_node(wxr, None, span_tag)
                for linkage in item_links:
                    linkage.roman = roman
            elif "lang" in span_tag.attrs:
                word = clean_node(wxr, None, span_tag)
                if word != "":
                    new_link = Linkage(word=word, source=source, sense=sense)
                    item_links.append(new_link)
                    if span_class == "Hant":
                        new_link.tags.append("Traditional-Chinese")
                    elif span_class == "Hans":
                        new_link.tags.append("Simplified-Chinese")
        getattr(word_entry, linkage_type).extend(item_links)

72 

73 

def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Extract linkage words from a single wiki list item.

    Supports "l" templates, italic-wrapped thesaurus links (which recurse
    into the thesaurus page), plain links, and a trailing "- gloss" /
    "– gloss" text segment that overrides the sense of everything
    collected so far.
    """
    collected: list[Linkage] = []

    for index, child in enumerate(list_item.children):
        if isinstance(child, TemplateNode) and child.template_name == "l":
            word = clean_node(
                wxr, None, child.template_parameters.get(2, "")
            )
            if word != "":
                collected.append(
                    Linkage(word=word, source=source, sense=sense)
                )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            for link_node in child.find_child(NodeKind.LINK):
                link_str = clean_node(wxr, None, link_node)
                # Only recurse when we are not already inside a thesaurus
                # page, to avoid infinite recursion.
                if link_str.startswith(
                    "อรรถาภิธาน:"
                ) and not source.startswith("อรรถาภิธาน:"):
                    extract_thesaurus_page(
                        wxr, word_entry, linkage_type, link_str, sense
                    )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            link_str = clean_node(wxr, None, child)
            if link_str != "":
                # NOTE(review): unlike the "l"-template branch, `source`
                # is not recorded here — confirm this is intentional.
                collected.append(Linkage(word=link_str, sense=sense))
        elif isinstance(child, str) and ("-" in child or "–" in child):
            # Everything after the first dash (ASCII "-" wins over "–")
            # plus the remaining children form the sense gloss.
            dash = "-" if "-" in child else "–"
            sense = child[child.index(dash) + 1 :]
            sense = clean_node(
                wxr,
                None,
                [sense] + list_item.children[index + 1 :],
            ).strip()
            for linkage in collected:
                linkage.sense = sense
            break

    getattr(word_entry, linkage_type).extend(collected)

121 

122 

def extract_thesaurus_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    page_title: str,
    sense: str,
) -> None:
    """Pull matching linkage sections out of a thesaurus page.

    Only sections whose language and part-of-speech titles match
    ``word_entry`` and whose heading maps to ``linkage_type`` are
    extracted; the thesaurus page title is recorded as the source.
    """
    # Namespace id 110 — presumably the thesaurus ("อรรถาภิธาน")
    # namespace; confirm against the site's namespace table.
    page = wxr.wtp.get_page(page_title, 110)
    if page is None or page.body is None:
        return
    root = wxr.wtp.parse(page.body)
    for lang_section in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(
            wxr, None, lang_section.largs
        ).removeprefix("ภาษา")
        if lang_name != word_entry.lang:
            continue
        for pos_section in lang_section.find_child(NodeKind.LEVEL3):
            if clean_node(wxr, None, pos_section.largs) != word_entry.pos_title:
                continue
            for linkage_section in pos_section.find_child_recursively(
                LEVEL_KIND_FLAGS
            ):
                title = clean_node(wxr, None, linkage_section.largs)
                if LINKAGE_SECTIONS.get(title) != linkage_type:
                    continue
                extract_linkage_section(
                    wxr,
                    word_entry,
                    linkage_section,
                    linkage_type,
                    source=page_title,
                    sense=sense,
                )

158 

159 

def extract_ws_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Extract a single linkage word from a "ws" template (2nd argument)."""
    term = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if term == "":
        return
    getattr(word_entry, linkage_type).append(
        Linkage(word=term, source=source, sense=sense)
    )

172 

173 

# Maps nym-template names (as used in wikitext) to the WordEntry field
# that collects that linkage type.
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "synsee": "synonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "cot": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
    "hyper": "hypernyms",
    "hypernyms": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
}

187 

188 

def extract_syn_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract linkage words from a nym template's positional arguments.

    Arguments are read from position 2 upward until the first missing one.
    Thesaurus-page arguments recurse into the thesaurus; plain words are
    appended to the ``linkage_type`` list on ``word_entry``, tagged with
    the gloss of the most recent sense.
    """
    # Fix: the original unconditionally read word_entry.senses[-1], which
    # raises IndexError when the template appears before any sense is
    # parsed; fall back to an empty sense string instead.
    sense = (
        " ".join(word_entry.senses[-1].glosses)
        if len(word_entry.senses) > 0
        else ""
    )
    # Positional args are integer keys starting at 2; stop at the first gap.
    for arg_name in count(2):
        if arg_name not in t_node.template_parameters:
            break
        arg_value = clean_node(
            wxr, None, t_node.template_parameters[arg_name]
        )
        if arg_value.startswith("อรรถาภิธาน:"):
            extract_thesaurus_page(
                wxr, word_entry, linkage_type, arg_value, sense
            )
        elif arg_value != "":
            getattr(word_entry, linkage_type).append(
                Linkage(word=arg_value, sense=sense)
            )