Coverage for src/wiktextract/extractor/ms/linkage.py: 86% (92 statements)

coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

from collections import defaultdict

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, Linkage, WordEntry
from .section_titles import LINKAGE_SECTIONS
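
# extract_form_section: collects word forms listed under a form heading.
# Script templates (ARchar, Arab, PSchar, SDchar) and bare wiki links become
# Form objects carrying the tags supplied by the caller; links nested inside
# bullet lists are picked up the same way.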

def extract_form_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    tags: list[str],
) -> None:
    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        if (
            isinstance(node, TemplateNode)
            and node.template_name in ["ARchar", "Arab", "PSchar", "SDchar"]
        ) or node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if word != "":
                word_entry.forms.append(Form(form=word, tags=tags))
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for node in list_item.find_child(NodeKind.LINK):
                word = clean_node(wxr, None, node)
                if word != "":
                    word_entry.forms.append(Form(form=word, tags=tags))
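
# extract_linkage_section: walks the bullet lists under a linkage heading and
# collects Linkage objects per target field in a temporary dict.  The result
# is merged into base_data when no entry for the current language exists yet,
# into every entry of the last language when the heading is a level-3 node,
# and into the last entry otherwise.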

def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    l_dict = defaultdict(list)
    linkage_name = clean_node(wxr, None, level_node.largs).lower()
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            new_l_name = extract_linkage_list_item(
                wxr, l_dict, linkage_name, list_item
            )
            if new_l_name != "":
                linkage_name = new_l_name

    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        for field, data in l_dict.items():
            getattr(base_data, field).extend(data)
    elif level_node.kind == NodeKind.LEVEL3:
        for data in page_data:
            if data.lang_code == page_data[-1].lang_code:
                for field, l_data in l_dict.items():
                    getattr(data, field).extend(l_data)
    else:
        for field, l_data in l_dict.items():
            getattr(page_data[-1], field).extend(l_data)
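
# extract_linkage_list_item: processes one list item and returns the current
# linkage name (possibly updated).  Definition-style items rename the linkage
# type and split their definition into links and comma-separated words; items
# containing bold text are handled as proverbs; plain items yield linked
# words, optionally scoped by a {{sense}} template or an inline "name:"
# prefix that switches the linkage type.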

def extract_linkage_list_item(
    wxr: WiktextractContext,
    l_dict: dict[str, list[Linkage]],
    linkage_name: str,
    list_item: WikiNode,
) -> str:
    if list_item.definition is not None and len(list_item.definition) > 0:
        linkage_name = clean_node(wxr, None, list_item.children).lower()
        if linkage_name not in LINKAGE_SECTIONS:
            return ""
        for node in list_item.definition:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                if word != "":
                    l_dict[LINKAGE_SECTIONS[linkage_name]].append(
                        Linkage(word=word)
                    )
            elif isinstance(node, str):
                for word in node.split(","):
                    word = word.strip(" .\n")
                    if word != "":
                        l_dict[LINKAGE_SECTIONS[linkage_name]].append(
                            Linkage(word=word)
                        )
    elif list_item.contain_node(NodeKind.BOLD):
        extract_proverb_list(
            wxr, l_dict, list_item, LINKAGE_SECTIONS[linkage_name]
        )
    else:
        sense = ""
        for node in list_item.children:
            if isinstance(node, TemplateNode) and node.template_name == "sense":
                sense = clean_node(wxr, None, node).strip("(): ")
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                if word != "" and linkage_name in LINKAGE_SECTIONS:
                    l_dict[LINKAGE_SECTIONS[linkage_name]].append(
                        Linkage(word=word, sense=sense)
                    )
            elif isinstance(node, str) and node.strip().endswith(":"):
                new_linkage_name = node.strip("(): ").lower()
                if new_linkage_name in LINKAGE_SECTIONS:
                    linkage_name = new_linkage_name

    return linkage_name
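
# LINKAGE_TEMPLATES: maps linkage template names (Malay and English aliases)
# to the WordEntry list field that receives the extracted Linkage objects.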

LINKAGE_TEMPLATES = {
    "antonim": "antonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "sinonim": "synonyms",
    "synonyms": "synonyms",
    "syn": "synonyms",
    "sin": "synonyms",
    "hypernyms": "hypernyms",
    "hyper": "hypernyms",
    "kata setara": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
    "perkataan koordinat": "coordinate_terms",
    "cot": "coordinate_terms",
    "hiponim": "hyponyms",
    "hipo": "hyponyms",
    "hyponyms": "hyponyms",
}
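
# extract_nyms_template: expands a nyms-style template (Modul:nyms), keeps the
# <span> elements whose lang attribute matches the template's first parameter,
# and appends each word to the WordEntry field chosen by LINKAGE_TEMPLATES,
# attaching the glosses of the most recent sense when available.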

def extract_nyms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    # Modul:nyms
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html_recursively("span"):
        if lang_code == span_tag.attrs.get("lang", ""):
            word = clean_node(wxr, None, span_tag)
            if word != "":
                l_data = Linkage(word=word)
                if (
                    len(word_entry.senses) > 0
                    and len(word_entry.senses[-1].glosses) > 0
                ):
                    l_data.sense = " ".join(word_entry.senses[-1].glosses)
                getattr(
                    word_entry, LINKAGE_TEMPLATES[t_node.template_name]
                ).append(l_data)
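
# extract_proverb_list: handles a list item made of one or more bold proverbs
# followed by ": explanation" text; each bold run becomes Linkage.word and the
# text after the first colon (plus any remaining child nodes) becomes the
# shared sense.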

def extract_proverb_list(
    wxr: WiktextractContext,
    l_dict: dict[str, list[Linkage]],
    list_item: WikiNode,
    linkage_type: str,
) -> None:
    proverbs = []
    after_bold = False
    sense = ""
    for index, node in enumerate(list_item.children):
        if isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
            proverb = clean_node(wxr, None, node)
            if proverb != "":
                proverbs.append(proverb)
            after_bold = True
        elif after_bold and isinstance(node, str) and ":" in node:
            sense = clean_node(
                wxr,
                None,
                [node[node.index(":") + 1 :]] + list_item.children[index + 1 :],
            )
    for proverb in proverbs:
        l_dict[linkage_type].append(Linkage(word=proverb, sense=sense))
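
# ---------------------------------------------------------------------------
# Usage sketch (not part of the module above, and not taken from this
# coverage report): a minimal, test-style way to drive extract_linkage_section
# with an installed wiktextract.  The constructor arguments, the WordEntry
# fields, the page text, and the assumption that "sinonim" is a key of
# LINKAGE_SECTIONS all follow the usual wiktextract test pattern and should
# be treated as illustrative only.
#
#   from wikitextprocessor import Wtp
#   from wiktextract.config import WiktionaryConfig
#   from wiktextract.extractor.ms.linkage import extract_linkage_section
#   from wiktextract.extractor.ms.models import WordEntry
#   from wiktextract.wxr_context import WiktextractContext
#
#   wxr = WiktextractContext(
#       Wtp(lang_code="ms"),
#       WiktionaryConfig(dump_file_lang_code="ms", capture_language_codes=None),
#   )
#   wxr.wtp.start_page("contoh")
#   root = wxr.wtp.parse("====Sinonim====\n* [[teladan]]")
#   entry = WordEntry(word="contoh", lang_code="ms", lang="Melayu", pos="noun")
#   extract_linkage_section(wxr, [entry], entry, root.children[0])
#   # entry.synonyms should now contain one Linkage with word "teladan".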