Coverage for src/wiktextract/extractor/th/linkage.py: 88%

92 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1from itertools import count 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .models import Linkage, WordEntry 

14from .section_titles import LINKAGE_SECTIONS 

15 

16 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    source: str = "",
) -> None:
    """Dispatch each child of a linkage section to its handler.

    ``col*`` templates, ``ws`` templates and plain wiki lists are each
    processed by a dedicated extractor; other children are ignored.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            if child.template_name.startswith("col"):
                extract_col_template(
                    wxr, word_entry, child, linkage_type, source
                )
            elif child.template_name == "ws":
                extract_ws_template(
                    wxr, word_entry, child, linkage_type, source
                )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr, word_entry, item, linkage_type, source
                )

36 

37 

def extract_col_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
) -> None:
    """Collect linkages from an expanded ``col*`` column template.

    The template is expanded to HTML; each ``<li>`` yields one or more
    linkage words (``span`` tags carrying a ``lang`` attribute), and a
    ``Latn`` span provides the romanization shared by the words gathered
    so far in that list item.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    target_list = getattr(word_entry, linkage_type)
    for li_tag in expanded.find_html_recursively("li"):
        item_linkages: list[Linkage] = []
        for span_tag in li_tag.find_html("span"):
            css_class = span_tag.attrs.get("class", "")
            if "Latn" in css_class:
                # Romanization span: applies to every word collected so
                # far in this list item (e.g. a Hant/Hans pair shares it).
                romanization = clean_node(wxr, None, span_tag)
                for linkage in item_linkages:
                    linkage.roman = romanization
            elif "lang" in span_tag.attrs:
                word = clean_node(wxr, None, span_tag)
                if word == "":
                    continue
                linkage = Linkage(word=word, source=source)
                if css_class == "Hant":
                    linkage.tags.append("Traditional-Chinese")
                elif css_class == "Hans":
                    linkage.tags.append("Simplified-Chinese")
                item_linkages.append(linkage)
        target_list.extend(item_linkages)

64 

65 

def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    source: str,
) -> None:
    """Extract linkages from one list item of a linkage section.

    Handles ``{{l}}`` templates, plain wiki links, italicized links to
    thesaurus ("อรรถาภิธาน:") pages, and a trailing "- gloss" / "– gloss"
    that becomes the shared sense of every linkage found in this item.
    """
    linkages: list[Linkage] = []

    for index, node in enumerate(list_item.children):
        if isinstance(node, TemplateNode) and node.template_name == "l":
            l_data = Linkage(
                word=clean_node(
                    wxr, None, node.template_parameters.get(2, "")
                ),
                source=source,
            )
            if l_data.word != "":
                linkages.append(l_data)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            for link_node in node.find_child(NodeKind.LINK):
                link_str = clean_node(wxr, None, link_node)
                # Follow a thesaurus link, but only when this call is not
                # itself extracting a thesaurus page (prevents recursion
                # back into the same page family).
                if link_str.startswith(
                    "อรรถาภิธาน:"
                ) and not source.startswith("อรรถาภิธาน:"):
                    extract_thesaurus_page(
                        wxr, word_entry, linkage_type, link_str
                    )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            link_str = clean_node(wxr, None, node)
            if link_str != "":
                # Fix: propagate `source` here as every other branch does;
                # it was previously dropped for plain links.
                linkages.append(Linkage(word=link_str, source=source))
        elif isinstance(node, str) and ("-" in node or "–" in node):
            # Text after the dash (plus all remaining children) is a sense
            # gloss shared by every linkage gathered from this item.
            dash = "-" if "-" in node else "–"
            sense = clean_node(
                wxr,
                None,
                [node[node.index(dash) + 1 :]]
                + list_item.children[index + 1 :],
            ).strip()
            for l_data in linkages:
                l_data.sense = sense
            break

    getattr(word_entry, linkage_type).extend(linkages)

111 

112 

def extract_thesaurus_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    page_title: str,
) -> None:
    """Pull linkages of `linkage_type` from a thesaurus page.

    Only level-2 sections matching the entry's language and level-3
    sections matching its part-of-speech title are considered; matching
    linkage subsections are extracted with the thesaurus page title
    recorded as the source.
    """
    page = wxr.wtp.get_page(page_title, 110)  # 110: thesaurus namespace id
    if page is None or page.body is None:
        return
    root = wxr.wtp.parse(page.body)
    for lang_level in root.find_child(NodeKind.LEVEL2):
        # Section titles look like "ภาษา<lang>"; strip the prefix.
        lang_name = clean_node(wxr, None, lang_level.largs).removeprefix(
            "ภาษา"
        )
        if lang_name != word_entry.lang:
            continue
        for pos_level in lang_level.find_child(NodeKind.LEVEL3):
            if clean_node(wxr, None, pos_level.largs) != word_entry.pos_title:
                continue
            for section_level in pos_level.find_child_recursively(
                LEVEL_KIND_FLAGS
            ):
                section_title = clean_node(wxr, None, section_level.largs)
                if LINKAGE_SECTIONS.get(section_title) == linkage_type:
                    extract_linkage_section(
                        wxr,
                        word_entry,
                        section_level,
                        linkage_type,
                        page_title,
                    )

146 

147 

def extract_ws_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
) -> None:
    """Extract a single linkage from a ``{{ws}}`` template.

    The word is the template's second positional argument; empty words
    are skipped.
    """
    word = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if word == "":
        return
    getattr(word_entry, linkage_type).append(
        Linkage(word=word, source=source)
    )

159 

160 

# Maps inline linkage template names (short and long forms) to the name
# of the matching list field on `WordEntry`.
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "synsee": "synonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "cot": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
    "hyper": "hypernyms",
    "hypernyms": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
}

174 

175 

def extract_syn_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract linkages from positional arguments 2, 3, … of a syn/ant/…
    template, stopping at the first missing argument.

    A value that is a thesaurus ("อรรถาภิธาน:") page title is expanded via
    `extract_thesaurus_page` instead of being stored directly.
    """
    arg_index = 2
    while arg_index in t_node.template_parameters:
        value = clean_node(
            wxr, None, t_node.template_parameters[arg_index]
        )
        if value.startswith("อรรถาภิธาน:"):
            extract_thesaurus_page(wxr, word_entry, linkage_type, value)
        elif value != "":
            getattr(word_entry, linkage_type).append(Linkage(word=value))
        arg_index += 1