Coverage for src/wiktextract/extractor/ms/pos.py: 95%

110 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from .example import extract_example_list_item 

6from .models import AltForm, Attestation, Form, Sense, WordEntry 

7from .section_titles import POS_DATA 

8from .tags import translate_raw_tags 

9 

# Template-name suffixes that identify a POS header template.  The tuple is
# passed directly to ``str.endswith``, so it must stay a tuple of strings.
POS_HEADER_TEMPLATE_SUFFIXES = (
    "-ks", "-adj",
    "-kn", "-noun",
    "-kk", "-verb", "-kerja",
    "-kgn", "-pron",
    "-kkt", "-adv",
    "-kp",
    "-sendi", "-prep",
    "-seru",
    "-kanji", "-hanzi", "-hanja",
    "-conj",
    "-hantu",
)

# Template names handled as "form of" templates inside a gloss, in addition
# to any template whose name ends with " of".
FORM_OF_TEMPLATES = {
    "ja-perumian",
    "jamak",
    "alt case",
}

# Template names whose mentioned word is stored in ``Sense.alt_of`` instead
# of ``Sense.form_of``.
ALT_OF_TEMPLATES = {
    "alt case",
    "alternative case form of",
}

35 

36 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Extract one part-of-speech section into a new ``page_data`` entry.

    A deep copy of ``base_data`` is appended to ``page_data`` and given the
    POS and tags that ``POS_DATA`` maps to the lower-cased ``pos_title``.
    The direct children of ``level_node`` are then scanned: list items of
    lists whose marker consists of ``#`` at both ends are parsed as glosses,
    and POS header templates are handed to ``extract_pos_header_template``.
    If the last entry of ``page_data`` has no senses afterwards, it is
    removed again.

    Raises:
        KeyError: if ``pos_title.lower()`` is not a key of ``POS_DATA``.
    """
    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    pos_data = POS_DATA[pos_title.lower()]
    page_data[-1].pos = pos_data["pos"]
    page_data[-1].tags.extend(pos_data.get("tags", []))

    for node in level_node.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # The marker belongs to the list, not to its items, so it is
            # checked once per list.
            if node.sarg.startswith("#") and node.sarg.endswith("#"):
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    # ``page_data[-1]`` is looked up on every call because a
                    # header template may have appended a newer entry.
                    extract_gloss_list_item(wxr, page_data[-1], list_item)
        elif isinstance(node, TemplateNode) and (
            node.template_name.endswith(POS_HEADER_TEMPLATE_SUFFIXES)
            or node.template_name in ["inti", "head", "Han char"]
        ):
            extract_pos_header_template(wxr, page_data, base_data, node)

    # Drop the trailing entry when no gloss was found for it.
    if not page_data[-1].senses:
        page_data.pop()

66 

67 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Turn one gloss list item into a ``Sense`` and process its sub-lists.

    The new sense starts as a deep copy of ``parent_sense`` when one is
    given, otherwise as an empty ``Sense``.  Label and ``defdate`` templates
    only feed metadata into the sense; form-of templates feed metadata and
    are also kept as part of the gloss text; nested lists are excluded from
    the gloss text.  The sense is appended to ``word_entry.senses`` only when
    it has at least one gloss.

    Nested lists whose marker starts and ends with ``#`` are parsed
    recursively with this sense as parent; nested lists whose marker starts
    with ``#`` and ends with ``:`` or ``*`` are parsed as examples.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    label_names = [
        "label",
        "lb",
        "konteks",
        "context",
        "konteks 1",
        "context 2",
    ]
    gloss_parts = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in label_names:
                extract_label_template(wxr, sense, child)
                continue
            if name == "defdate":
                extract_defdate_template(wxr, sense, child)
                continue
            if (
                name.endswith(" of")
                or name in FORM_OF_TEMPLATES
                or name in ALT_OF_TEMPLATES
            ):
                extract_form_of_template(wxr, sense, child)
                # The template's text is still part of the gloss.
                gloss_parts.append(child)
                continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Nested lists are handled separately below.
            continue
        gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text:
        sense.glosses.append(gloss_text)
    if sense.glosses:
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
        elif marker.endswith((":", "*")):
            for example_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, example_item
                )

117 

118 

def extract_pos_header_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Read POS, categories and forms from an expanded POS header template.

    The template is expanded and its top-level link nodes are cleaned into a
    scratch dict to collect categories.  The first category whose lower-cased
    text starts with a ``POS_DATA`` key, and whose mapped POS is not
    ``"unknown"``, supplies the POS and tags.

    If the last entry of ``page_data`` has the title ``"Takrifan"`` and
    already has a known POS, a fresh deep copy of ``base_data`` is appended
    and receives the new POS; an entry whose POS is still ``"unknown"`` is
    filled in place.  Categories are added to the resulting last entry.

    Every ``<b>`` element in the expansion becomes a ``Form`` on that entry,
    tagged with the text of the most recent preceding ``<i>`` element, if
    any.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cats = {}
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, cats, link)
    categories = cats.get("categories", [])

    pos_type = "unknown"
    pos_tags = []
    for category in categories:
        lowered = category.lower()
        matched = next(
            (
                data
                for title, data in POS_DATA.items()
                if lowered.startswith(title)
            ),
            None,
        )
        if matched is None:
            continue
        pos_type = matched["pos"]
        pos_tags = matched.get("tags", [])
        if pos_type != "unknown":
            break

    entry = page_data[-1]
    if entry.pos_title == "Takrifan" and entry.pos != "unknown":
        # The current entry already has a POS: start a new one.
        entry = base_data.model_copy(deep=True)
        page_data.append(entry)
        entry.pos = pos_type
        entry.pos_title = "Takrifan"
        entry.tags.extend(pos_tags)
    if entry.pos == "unknown":
        entry.pos = pos_type
        entry.tags.extend(pos_tags)
    entry.categories.extend(categories)

    # ``raw_tag`` deliberately carries over from one <b> to the next until
    # another <i> element replaces it.
    raw_tag = ""
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        if html_node.tag == "i":
            raw_tag = clean_node(wxr, None, html_node)
        elif html_node.tag == "b":
            form = Form(form=clean_node(wxr, None, html_node))
            if raw_tag != "":
                form.raw_tags.append(raw_tag)
            if form.form != "":
                translate_raw_tags(form)
                entry.forms.append(form)

163 

164 

def extract_label_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the comma-separated labels of a label template to ``sense``.

    The cleaned template text has surrounding parentheses and spaces
    stripped, is split on commas, and every non-empty trimmed piece is
    appended to ``sense.raw_tags``.
    """
    label_text = clean_node(wxr, sense, t_node).strip("() ")
    pieces = (piece.strip() for piece in label_text.split(","))
    sense.raw_tags.extend(piece for piece in pieces if piece != "")

173 

174 

def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the words mentioned by a form-of or alt-of template.

    The template is expanded and every ``<i>`` element whose ``class``
    attribute contains ``"mention"`` is cleaned to text.  Non-empty words are
    appended to ``sense.alt_of`` when the template name is in
    ``ALT_OF_TEMPLATES``, otherwise to ``sense.form_of``.
    """
    # The destination depends only on the template name, so pick it once.
    if t_node.template_name in ALT_OF_TEMPLATES:
        target = sense.alt_of
    else:
        target = sense.form_of

    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for element in expanded.find_child_recursively(NodeKind.HTML):
        if element.tag != "i":
            continue
        if "mention" not in element.attrs.get("class", ""):
            continue
        word = clean_node(wxr, None, element)
        if word != "":
            target.append(AltForm(word=word))

189 

190 

def extract_defdate_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Store the date shown by a ``defdate`` template on ``sense``.

    The template is expanded and cleaned to text, and leading and trailing
    square brackets are stripped.  A non-empty result is appended to
    ``sense.attestations`` as an ``Attestation``.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    date = clean_node(wxr, None, expanded_node).strip("[]")
    if date != "":
        sense.attestations.append(Attestation(date=date))