Coverage for src/wiktextract/extractor/ms/pos.py: 95%

103 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from .example import extract_example_list_item 

6from .models import AltForm, Form, Sense, WordEntry 

7from .section_titles import POS_DATA 

8from .tags import translate_raw_tags 

9 

# Template-name suffixes recognised as part-of-speech header templates.
# Kept as a tuple because extract_pos_section() passes it directly to
# str.endswith(), which accepts a tuple of candidate suffixes.
# NOTE(review): the short forms look like Malay abbreviations paired with
# their English equivalents (e.g. "-ks"/"-adj") — confirm against the wiki.
POS_HEADER_TEMPLATE_SUFFIXES = (
    "-ks", "-adj",
    "-kn", "-noun",
    "-kk", "-verb", "-kerja",
    "-kgn", "-pron",
    "-kkt", "-adv",
    "-kp",
    "-sendi", "-prep",
    "-seru",
    "-kanji", "-hanzi", "-hanja",
    "-conj",
    "-hantu",
)

32 

# Template names that extract_gloss_list_item() hands to
# extract_form_of_template(), in addition to any name ending in " of".
FORM_OF_TEMPLATES = {
    "ja-perumian",
    "jamak",
    "alt case",
}
# Template names whose mentioned word is stored in Sense.alt_of rather than
# Sense.form_of. "alt case" appears in both sets; extract_form_of_template()
# checks this set first, so it is treated as an alt-of template.
ALT_OF_TEMPLATES = {
    "alt case",
    "alternative case form of",
}

35 

36 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Extract a part-of-speech section into a new entry of ``page_data``.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled with
    the POS information looked up in ``POS_DATA``. Numbered lists (``#``)
    directly under ``level_node`` are parsed as glosses, and POS header
    templates are parsed for categories and forms. If the last entry of
    ``page_data`` ends up without any sense, it is removed again.

    Args:
        wxr: Extraction context passed through to the helper functions.
        page_data: Entries collected so far for the page; modified in place.
        base_data: Template entry copied for each new word entry.
        level_node: Section node whose direct children are scanned.
        pos_title: Section title; its lowercase form must be a key of
            ``POS_DATA``.

    Raises:
        KeyError: If ``pos_title.lower()`` is not a key of ``POS_DATA``.
    """
    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    pos_data = POS_DATA[pos_title.lower()]
    page_data[-1].pos = pos_data["pos"]
    page_data[-1].tags.extend(pos_data.get("tags", []))

    for node in level_node.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # The list marker is the same for every item of the list, so it
            # is checked once instead of once per item.
            if node.sarg.startswith("#") and node.sarg.endswith("#"):
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    # Always target the current last entry: a preceding POS
                    # header template may have appended a new one.
                    extract_gloss_list_item(wxr, page_data[-1], list_item)
        elif isinstance(node, TemplateNode) and (
            node.template_name.endswith(POS_HEADER_TEMPLATE_SUFFIXES)
            or node.template_name in ["inti", "head", "Han char"]
        ):
            extract_pos_header_template(wxr, page_data, base_data, node)

    # Drop an entry that received no senses.
    if len(page_data[-1].senses) == 0:
        page_data.pop()

66 

67 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Turn one gloss list item into a ``Sense`` and recurse into sub-lists.

    Args:
        wxr: Extraction context passed to ``clean_node`` and the helpers.
        word_entry: Entry that receives the resulting sense(s).
        list_item: The list item node to parse.
        parent_sense: Sense of the enclosing list item, if any. The new sense
            starts as a deep copy of it, so it carries over the parent's data.
    """
    label_templates = (
        "label",
        "lb",
        "konteks",
        "context",
        "konteks 1",
        "context 2",
    )
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_parts = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in label_templates:
                # Label templates only contribute raw tags, not gloss text.
                extract_label_template(wxr, sense, child)
                continue
            if (
                name.endswith(" of")
                or name in FORM_OF_TEMPLATES
                or name in ALT_OF_TEMPLATES
            ):
                # Form-of templates fill form_of/alt_of and also stay in
                # the gloss text.
                extract_form_of_template(wxr, sense, child)
                gloss_parts.append(child)
                continue
        # Nested lists are handled separately below; everything else is
        # part of the gloss.
        is_nested_list = (
            isinstance(child, WikiNode) and child.kind == NodeKind.LIST
        )
        if not is_nested_list:
            gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text:
        sense.glosses.append(gloss_text)
    if sense.glosses:
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            # Nested numbered list: sub-glosses derived from this sense.
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
        elif marker.endswith((":", "*")):
            # "#:" or "#*" list: examples attached to this sense.
            for example_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, example_item
                )

115 

116 

def extract_pos_header_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Read categories, POS and forms from an expanded POS header template.

    The template is expanded and its top-level link nodes are cleaned to
    collect categories. The first category whose lowercase form starts with a
    ``POS_DATA`` key (and maps to a POS other than "unknown") determines the
    POS type and tags. Bold HTML elements in the expansion become ``Form``
    objects, tagged with the text of the most recent italic element.

    Args:
        wxr: Extraction context used for expansion and cleaning.
        page_data: Entries collected so far; the last one is updated, and a
            new one may be appended for a "Takrifan" section.
        base_data: Template entry copied when a new entry is appended.
        t_node: The POS header template node.
    """
    cats = {}
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, cats, link)
    categories = cats.get("categories", [])

    pos_type = "unknown"
    pos_tags = []
    for category in categories:
        lowered = category.lower()
        matched = next(
            (
                data
                for title, data in POS_DATA.items()
                if lowered.startswith(title)
            ),
            None,
        )
        if matched is not None:
            pos_type = matched["pos"]
            pos_tags = matched.get("tags", [])
        if pos_type != "unknown":
            break

    entry = page_data[-1]
    if entry.pos_title == "Takrifan" and entry.pos != "unknown":
        # The current "Takrifan" entry already has a POS, so this header
        # starts a new entry.
        entry = base_data.model_copy(deep=True)
        page_data.append(entry)
        entry.pos = pos_type
        entry.pos_title = "Takrifan"
        entry.tags.extend(pos_tags)
    if entry.pos == "unknown":
        entry.pos = pos_type
        entry.tags.extend(pos_tags)
    entry.categories.extend(categories)

    pending_tag = ""
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        tag_name = html_node.tag
        if tag_name == "i":
            # Italic text labels the bold form(s) that follow it.
            pending_tag = clean_node(wxr, None, html_node)
        elif tag_name == "b":
            form = Form(form=clean_node(wxr, None, html_node))
            if pending_tag:
                form.raw_tags.append(pending_tag)
            if form.form:
                translate_raw_tags(form)
                entry.forms.append(form)

161 

162 

def extract_label_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the comma-separated labels of a label template to ``sense.raw_tags``.

    The template is cleaned to text, surrounding parentheses and spaces are
    stripped, and each non-empty comma-separated piece becomes one raw tag.

    Args:
        wxr: Extraction context passed to ``clean_node``.
        sense: Sense that receives the raw tags; also passed to ``clean_node``.
        t_node: The label template node.
    """
    label_text = clean_node(wxr, sense, t_node).strip("() ")
    pieces = (piece.strip() for piece in label_text.split(","))
    sense.raw_tags.extend(piece for piece in pieces if piece)

171 

172 

def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the words mentioned by a form-of/alt-of template on ``sense``.

    The template is expanded and every ``<i>`` element whose ``class``
    attribute contains "mention" is cleaned to text. Non-empty results are
    appended to ``sense.alt_of`` when the template name is in
    ``ALT_OF_TEMPLATES``, otherwise to ``sense.form_of``.

    Args:
        wxr: Extraction context used for expansion and cleaning.
        sense: Sense whose ``alt_of`` or ``form_of`` list is extended.
        t_node: The form-of or alt-of template node.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # The destination depends only on the template name, so pick it once.
    if t_node.template_name in ALT_OF_TEMPLATES:
        target = sense.alt_of
    else:
        target = sense.form_of

    for element in expanded.find_child_recursively(NodeKind.HTML):
        if element.tag != "i":
            continue
        if "mention" not in element.attrs.get("class", ""):
            continue
        mentioned = clean_node(wxr, None, element)
        if mentioned:
            target.append(AltForm(word=mentioned))

185 else: 

186 sense.form_of.append(AltForm(word=word))