Coverage for src/wiktextract/extractor/ms/pos.py: 95%

1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

3from ...page import clean_node

4from ...wxr_context import WiktextractContext

5from .example import extract_example_list_item

6from .models import AltForm, Form, Sense, WordEntry

7from .section_titles import POS_DATA

8from .tags import translate_raw_tags

# Template-name suffixes that mark a POS header template.  The tuple is passed
# straight to ``str.endswith`` in ``extract_pos_section``, so it must stay a
# tuple of strings.
POS_HEADER_TEMPLATE_SUFFIXES = (
    "-ks", "-adj",
    "-kn", "-noun",
    "-kk", "-verb", "-kerja",
    "-kgn", "-pron",
    "-kkt", "-adv",
    "-kp",
    "-sendi", "-prep",
    "-seru",
    "-kanji", "-hanzi", "-hanja",
    "-conj",
    "-hantu",
)

# Gloss templates treated as "form of" templates even though their names do
# not end with " of" (that suffix is checked separately by the gloss code).
FORM_OF_TEMPLATES = {
    "ja-perumian",
    "jamak",
    "alt case",
}

# Gloss templates whose target word is stored in ``Sense.alt_of`` instead of
# ``Sense.form_of``.  Membership here takes precedence in
# ``extract_form_of_template``.
ALT_OF_TEMPLATES = {
    "alt case",
    "alternative case form of",
}

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Extract one part-of-speech section into a new entry of ``page_data``.

    A deep copy of ``base_data`` is appended to ``page_data`` and receives
    the POS and tags that ``POS_DATA`` holds for the lower-cased
    ``pos_title``.  The direct children of ``level_node`` are then scanned:

    * lists whose marker starts and ends with ``#`` are gloss lists; each of
      their items is handed to ``extract_gloss_list_item``;
    * templates whose name ends with one of ``POS_HEADER_TEMPLATE_SUFFIXES``
      or is ``inti``, ``head`` or ``Han char`` are handed to
      ``extract_pos_header_template``.

    If the last entry of ``page_data`` ends up without senses it is removed.

    Raises:
        KeyError: if ``pos_title.lower()`` is not a key of ``POS_DATA``.
    """
    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    pos_data = POS_DATA[pos_title.lower()]
    page_data[-1].pos = pos_data["pos"]
    page_data[-1].tags.extend(pos_data.get("tags", []))

    for node in level_node.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # The marker belongs to the list, not to its items, so it is
            # checked once per list.
            if node.sarg.startswith("#") and node.sarg.endswith("#"):
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    # page_data[-1] is looked up on every call because a
                    # header template seen earlier may have appended a
                    # newer entry.
                    extract_gloss_list_item(wxr, page_data[-1], list_item)
        elif isinstance(node, TemplateNode) and (
            node.template_name.endswith(POS_HEADER_TEMPLATE_SUFFIXES)
            or node.template_name in ["inti", "head", "Han char"]
        ):
            extract_pos_header_template(wxr, page_data, base_data, node)

    # Drop the entry again when the section produced no senses.
    if not page_data[-1].senses:
        page_data.pop()

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Turn one gloss list item (and its nested lists) into senses.

    The new sense starts as a deep copy of ``parent_sense`` when one is
    given, otherwise as an empty ``Sense``.  Label templates contribute raw
    tags, form-of templates contribute ``form_of``/``alt_of`` data and also
    stay part of the gloss text, nested lists are left out of the gloss
    text.  The sense is appended to ``word_entry.senses`` only if it has at
    least one gloss.

    Nested lists whose marker starts with ``#`` are then processed: a marker
    ending with ``#`` recurses with the new sense as parent, a marker ending
    with ``:`` or ``*`` is treated as an example list for the new sense.
    """
    label_template_names = [
        "label",
        "lb",
        "konteks",
        "context",
        "konteks 1",
        "context 2",
    ]
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_nodes = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in label_template_names:
                extract_label_template(wxr, sense, child)
                continue
            if (
                name.endswith(" of")
                or name in FORM_OF_TEMPLATES
                or name in ALT_OF_TEMPLATES
            ):
                extract_form_of_template(wxr, sense, child)
                # The expanded template text is still part of the gloss.
                gloss_nodes.append(child)
                continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Nested lists are handled separately below.
            continue
        gloss_nodes.append(child)

    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
    if sense.glosses:
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
        elif marker.endswith((":", "*")):
            for example_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, example_item
                )

115

116

def extract_pos_header_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Read POS, categories and forms from an expanded POS header template.

    The template is expanded and re-parsed.  Categories are collected by
    running ``clean_node`` over the link nodes returned by ``find_child``.
    The POS is taken from the first category whose lower-cased name starts
    with a key of ``POS_DATA`` and whose POS is not ``"unknown"``.

    When the last entry has the title ``"Takrifan"`` and already has a known
    POS, a fresh copy of ``base_data`` is appended for this header.  An entry
    whose POS is still ``"unknown"`` receives the detected POS and tags.
    The categories are added to the last entry.

    Afterwards every ``<b>`` element in the expansion becomes a ``Form`` on
    the last entry, tagged with the text of the most recent ``<i>`` element
    seen before it (if any).
    """
    category_data = {}
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, category_data, link)
    categories = category_data.get("categories", [])

    pos_type = "unknown"
    pos_tags = []
    for category in categories:
        for title_prefix, data in POS_DATA.items():
            if category.lower().startswith(title_prefix):
                pos_type = data["pos"]
                pos_tags = data.get("tags", [])
                break
        if pos_type != "unknown":
            break

    last_entry = page_data[-1]
    if last_entry.pos_title == "Takrifan" and last_entry.pos != "unknown":
        # The current entry already has a POS: start a new one.
        page_data.append(base_data.model_copy(deep=True))
        last_entry = page_data[-1]
        last_entry.pos = pos_type
        last_entry.pos_title = "Takrifan"
        last_entry.tags.extend(pos_tags)
    if last_entry.pos == "unknown":
        last_entry.pos = pos_type
        last_entry.tags.extend(pos_tags)
    last_entry.categories.extend(categories)

    pending_raw_tag = ""
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        if html_node.tag == "i":
            # Remembered and attached to the <b> forms that follow.
            pending_raw_tag = clean_node(wxr, None, html_node)
        elif html_node.tag == "b":
            form = Form(form=clean_node(wxr, None, html_node))
            if pending_raw_tag != "":
                form.raw_tags.append(pending_raw_tag)
            if form.form != "":
                translate_raw_tags(form)
                last_entry.forms.append(form)

161

162

def extract_label_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the comma-separated labels of a label template to ``sense.raw_tags``.

    The template is rendered with ``clean_node`` (with ``sense`` passed as
    its data argument), surrounding parentheses and spaces are stripped, and
    every non-empty, whitespace-trimmed piece between commas becomes one raw
    tag.
    """
    label_text = clean_node(wxr, sense, t_node).strip("() ")
    sense.raw_tags.extend(
        [
            label
            for piece in label_text.split(",")
            if (label := piece.strip()) != ""
        ]
    )

171

172

def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the words a form-of template points at on ``sense``.

    The template is expanded and re-parsed; every ``<i>`` element whose
    ``class`` attribute contains ``"mention"`` yields one word.  Non-empty
    words are stored as ``AltForm`` items in ``sense.alt_of`` when the
    template name is in ``ALT_OF_TEMPLATES``, otherwise in ``sense.form_of``.
    """
    # The template name does not change during the loop, so the destination
    # list is selected once.
    if t_node.template_name in ALT_OF_TEMPLATES:
        destination = sense.alt_of
    else:
        destination = sense.form_of

    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    for element in expanded.find_child_recursively(NodeKind.HTML):
        if element.tag != "i":
            continue
        if "mention" not in element.attrs.get("class", ""):
            continue
        word = clean_node(wxr, None, element)
        if word != "":
            destination.append(AltForm(word=word))