Coverage for src/wiktextract/extractor/ms/linkage.py: 86%

92 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1from collections import defaultdict 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, Linkage, WordEntry 

8from .section_titles import LINKAGE_SECTIONS 

9 

10 

def extract_form_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    tags: list[str],
) -> None:
    """Collect word forms from a section and append them to `word_entry.forms`.

    Forms are read from two places: the section's direct template/link
    children (only the listed script templates and plain links are used),
    and links found inside the items of any list directly under the section.
    Every non-empty cleaned text becomes a `Form` carrying `tags`.
    """
    script_templates = ["ARchar", "Arab", "PSchar", "SDchar"]

    def add_form(source_node) -> None:
        # Shared by both passes: clean the node and skip empty results.
        text = clean_node(wxr, None, source_node)
        if text == "":
            return
        word_entry.forms.append(Form(form=text, tags=tags))

    # Pass 1: templates and links sitting directly under the section.
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        is_script_template = (
            isinstance(child, TemplateNode)
            and child.template_name in script_templates
        )
        if is_script_template or child.kind == NodeKind.LINK:
            add_form(child)

    # Pass 2: links inside list items of the section.
    for list_node in level_node.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            for link in item.find_child(NodeKind.LINK):
                add_form(link)

31 

32 

def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Gather linkages from a section's lists and merge them into entries.

    Each list item is handed to `extract_linkage_list_item`; a non-empty
    return value becomes the linkage name used for the following items.
    The collected lists are then added to:

    * `base_data`, when `page_data` is empty or its last entry has a
      different `lang_code` than `base_data`;
    * every entry sharing the last entry's `lang_code`, when the section
      is a level-3 node;
    * only the last entry of `page_data` otherwise.
    """
    collected = defaultdict(list)
    current_name = clean_node(wxr, None, level_node.largs).lower()
    for list_node in level_node.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            returned_name = extract_linkage_list_item(
                wxr, collected, current_name, item
            )
            # An empty string means "keep the current linkage name".
            if returned_name != "":
                current_name = returned_name

    # Decide which entries receive the collected linkages.
    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        targets = [base_data]
    elif level_node.kind == NodeKind.LEVEL3:
        last_lang = page_data[-1].lang_code
        targets = [entry for entry in page_data if entry.lang_code == last_lang]
    else:
        targets = [page_data[-1]]

    for entry in targets:
        for field, linkages in collected.items():
            getattr(entry, field).extend(linkages)

60 

61 

def extract_linkage_list_item(
    wxr: WiktextractContext,
    l_dict: dict[str, list[Linkage]],
    linkage_name: str,
    list_item: WikiNode,
) -> str:
    """Extract linkages from one list item into `l_dict`.

    Three item shapes are handled:

    * items with a non-empty `definition`: the cleaned, lower-cased text of
      the item's children is taken as the linkage name and the words are
      read from the definition nodes;
    * items containing a bold node (when `linkage_name` is a known section
      name): delegated to `extract_proverb_list`;
    * any other item: links become linkages, a "sense" template sets the
      sense for the links that follow it, and a string ending in ":" may
      switch the linkage name.

    Results are appended to `l_dict` under `LINKAGE_SECTIONS[linkage_name]`.

    Returns the linkage name in effect at the end of the item, or "" when a
    definition-style item has a label that is not in `LINKAGE_SECTIONS`.
    """
    if list_item.definition is not None and len(list_item.definition) > 0:
        # Definition-style item: the children hold the label, the
        # definition holds the words.
        linkage_name = clean_node(wxr, None, list_item.children).lower()
        if linkage_name not in LINKAGE_SECTIONS:
            # Unknown label: nothing is extracted from this item.
            return ""
        for node in list_item.definition:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                if word != "":
                    l_dict[LINKAGE_SECTIONS[linkage_name]].append(
                        Linkage(word=word)
                    )
            elif isinstance(node, str):
                # Plain text: comma-separated words, trimmed of spaces,
                # periods and newlines.
                for word in node.split(","):
                    word = word.strip(" .\n")
                    if word != "":
                        l_dict[LINKAGE_SECTIONS[linkage_name]].append(
                            Linkage(word=word)
                        )
    elif (
        list_item.contain_node(NodeKind.BOLD)
        and linkage_name in LINKAGE_SECTIONS
    ):
        extract_proverb_list(
            wxr, l_dict, list_item, LINKAGE_SECTIONS[linkage_name]
        )
    else:
        sense = ""
        for node in list_item.children:
            if isinstance(node, TemplateNode) and node.template_name == "sense":
                # Applies to the links that come after it in this item.
                sense = clean_node(wxr, None, node).strip("(): ")
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                # Links under an unknown linkage name are dropped.
                if word != "" and linkage_name in LINKAGE_SECTIONS:
                    l_dict[LINKAGE_SECTIONS[linkage_name]].append(
                        Linkage(word=word, sense=sense)
                    )
            elif isinstance(node, str) and node.strip().endswith(":"):
                # A "name:" string switches the linkage name for later
                # links, but only if the name is a known section.
                new_linkage_name = node.strip("(): ").lower()
                if new_linkage_name in LINKAGE_SECTIONS:
                    linkage_name = new_linkage_name

    return linkage_name

110 

111 

# Maps a linkage template name to the WordEntry attribute that receives its
# words; extract_nyms_template looks the template name up here and appends
# to that attribute. Several aliases map to the same attribute.
LINKAGE_TEMPLATES = {
    "antonim": "antonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "sinonim": "synonyms",
    "synonyms": "synonyms",
    "syn": "synonyms",
    "sin": "synonyms",
    "hypernyms": "hypernyms",
    "hyper": "hypernyms",
    "kata setara": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
    "perkataan koordinat": "coordinate_terms",
    "cot": "coordinate_terms",
    "hiponim": "hyponyms",
    "hipo": "hyponyms",
    "hyponyms": "hyponyms",
}

130 

131 

def extract_nyms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Add the words produced by a linkage template to `word_entry`.

    The template is expanded and every `span` in the result whose `lang`
    attribute equals the template's first argument is treated as one word.
    When the entry's last sense has glosses, they are joined with spaces
    and stored as the linkage's sense. The destination attribute is chosen
    by `LINKAGE_TEMPLATES[t_node.template_name]`.
    """
    # Original note: Modul:nyms (presumably the wiki module behind these
    # templates; confirm against the wiki).
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)
    target_lang = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span in expanded_root.find_html_recursively("span"):
        if span.attrs.get("lang", "") != target_lang:
            continue
        term = clean_node(wxr, None, span)
        if term == "":
            continue
        linkage = Linkage(word=term)
        senses = word_entry.senses
        if len(senses) > 0 and len(senses[-1].glosses) > 0:
            linkage.sense = " ".join(senses[-1].glosses)
        # Looked up per word so an unknown template only fails when there
        # is actually something to store.
        field_name = LINKAGE_TEMPLATES[t_node.template_name]
        getattr(word_entry, field_name).append(linkage)

153 

154 

def extract_proverb_list(
    wxr: WiktextractContext,
    l_dict: dict[str, list[Linkage]],
    list_item: WikiNode,
    linkage_type: str,
) -> None:
    """Extract bold phrases of a list item as linkages with a shared sense.

    Each bold child with non-empty cleaned text is one linkage word. Once a
    bold node has been seen, a string child containing ":" sets the sense
    to the cleaned text after the first colon plus all following children;
    a later such string replaces it. All words are appended to
    `l_dict[linkage_type]` with the final sense.
    """
    bold_texts = []
    seen_bold = False
    meaning = ""
    children = list_item.children
    for position, child in enumerate(children):
        if isinstance(child, WikiNode) and child.kind == NodeKind.BOLD:
            seen_bold = True
            text = clean_node(wxr, None, child)
            if text != "":
                bold_texts.append(text)
            continue
        if seen_bold and isinstance(child, str) and ":" in child:
            # Everything after the first colon, plus the rest of the item.
            tail = child.partition(":")[2]
            meaning = clean_node(
                wxr, None, [tail] + children[position + 1 :]
            )

    # Explicit loop: nothing is written to l_dict when no bold text is found.
    for text in bold_texts:
        l_dict[linkage_type].append(Linkage(word=text, sense=meaning))

177 l_dict[linkage_type].append(Linkage(word=proverb, sense=sense))