Coverage for src/wiktextract/extractor/vi/page.py: 40%

167 statements  

coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

import string
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .descendant import extract_descendant_section
from .etymology import extract_etymology_section, extract_ja_kanjitab_template
from .linkage import extract_alt_form_section, extract_linkage_section
from .models import Form, Sense, WordEntry
from .pos import extract_note_section, extract_pos_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA, TRANSLATION_SECTIONS
from .sound import extract_homophone_section, extract_sound_section
from .tags import translate_raw_tags
from .translation import extract_translation_section


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
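    """Dispatch one subsection to the matching extractor.

    The section heading is cleaned and stripped of trailing digits
    (e.g. a numbered heading like "Từ nguyên 2") before it is compared
    against the known POS, linkage, and translation section titles.
    """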

    subtitle = clean_node(wxr, None, level_node.largs)
    subtitle = subtitle.rstrip(string.digits + string.whitespace)
    if subtitle in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, subtitle)
        if len(page_data[-1].senses) == 0 and subtitle in LINKAGE_SECTIONS:
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
                LINKAGE_SECTIONS[subtitle],
            )
    elif subtitle in TRANSLATION_SECTIONS:
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) else base_data, level_node
        )
    elif subtitle == "Cách phát âm":  # "Pronunciation"
        extract_sound_section(wxr, base_data, level_node)
    elif subtitle == "Từ đồng âm":  # "Homophones"
        extract_homophone_section(wxr, base_data, level_node)
    elif subtitle == "Từ nguyên":  # "Etymology"
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif subtitle == "Cách viết khác":  # "Alternative spellings"
        extract_alt_form_section(wxr, base_data, page_data, level_node)
    elif subtitle in ["Ghi chú sử dụng", "Chú ý"]:  # "Usage notes", "Note"
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif subtitle in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data if len(page_data) > 0 else [base_data],
            level_node,
            LINKAGE_SECTIONS[subtitle],
        )
    elif subtitle == "Hậu duệ":  # "Descendants"
        extract_descendant_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    # "Tham khảo" = References, "Cách ra dấu" = How to sign,
    # "Đọc thêm" = Further reading, "Xem thêm" = See also
    elif subtitle not in ["Tham khảo", "Cách ra dấu", "Đọc thêm", "Xem thêm"]:
        wxr.wtp.debug(f"Unknown title: {subtitle}", sortid="vi/page/22")

    extract_section_cats(wxr, base_data, page_data, level_node)
    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
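    """Parse one Vietnamese Wiktionary page into word entries.

    Each level-2 heading is a language section; its subsections are
    handled recursively by parse_section().
    """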

    # page layout:
    # https://vi.wiktionary.org/wiki/Wiktionary:Sơ_đồ_mục_từ

    # ignore thesaurus, rhyme, quote, reconstruct pages
    if page_title.startswith(
        ("Kho từ vựng:", "Vần:", "Kho ngữ liệu:", "Từ tái tạo:")
    ):
        return []

    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        categories = {}
        lang_name = clean_node(wxr, categories, level2_node.largs) or "unknown"
        lang_code = name_to_code(lang_name, "vi") or "unknown"
        for t_node in level2_node.find_content(NodeKind.TEMPLATE):
            if t_node.template_name == "langname":
                lang_code = clean_node(
                    wxr, None, t_node.template_parameters.get(1, "")
                )
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = categories.get("categories", [])
        extract_section_cats(wxr, base_data, page_data, level2_node)
        for t_node in level2_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name in ["zho-forms", "zh-forms"]:
                extract_zh_forms_template(wxr, base_data, t_node)
            elif t_node.template_name in ["zh-see", "zho-see"]:
                base_data.redirects.append(
                    clean_node(wxr, None, t_node.template_parameters.get(1, ""))
                )
                clean_node(wxr, base_data, t_node)
            elif t_node.template_name in ["ja-see", "jpn-see", "ja-see-kango"]:
                for key, value in t_node.template_parameters.items():
                    if isinstance(key, int):
                        base_data.redirects.append(clean_node(wxr, None, value))
                clean_node(wxr, base_data, t_node)
            elif (
                t_node.template_name.endswith("-kanjitab")
                or t_node.template_name == "ja-kt"
            ):
                extract_ja_kanjitab_template(wxr, t_node, base_data)

        if len(base_data.redirects) > 0:
            page_data.append(base_data)
        for next_level in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level)

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))

    return [d.model_dump(exclude_defaults=True) for d in page_data]

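# A minimal usage sketch (hypothetical wiring; the exact constructor
# arguments of Wtp, WiktionaryConfig, and WiktextractContext depend on
# the installed wikitextprocessor/wiktextract versions):
#
#     from wikitextprocessor import Wtp
#     from wiktextract.config import WiktionaryConfig
#     from wiktextract.wxr_context import WiktextractContext
#
#     wxr = WiktextractContext(Wtp(lang_code="vi"), WiktionaryConfig())
#     entries = parse_page(wxr, "nước", page_wikitext)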

def extract_section_cats(
    wxr: WiktextractContext,
    base_data: WordEntry,
    page_data: list[WordEntry],
    level_node: LevelNode,
):
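    """Collect category links and category templates under a section.

    Categories go to base_data when no entry for this language exists
    yet; otherwise they are copied onto every entry of that language.
    """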

    cats = {}
    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        if node.kind == NodeKind.LINK:
            clean_node(wxr, cats, node)
        elif node.template_name in [
            "topics",
            "C",
            "topic",
            "catlangname",
            "cln",
        ]:
            clean_node(wxr, cats, node)

    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        base_data.categories.extend(cats.get("categories", []))
    else:
        for data in page_data:
            if data.lang_code == page_data[-1].lang_code:
                data.categories.extend(cats.get("categories", []))


def extract_zh_forms_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
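    """Extract Chinese word forms from a zh-forms style template.

    The template is expanded and its table rows are walked: when a
    header cell carries the forms itself (in language-tagged spans),
    the data cells of that row are skipped.
    """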

    # the "lit" parameter holds the word's literal meaning
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            row_header_tags = []
            header_has_span = False
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header, row_header_tags, header_has_span = (
                        extract_zh_forms_header_cell(wxr, base_data, cell)
                    )
                elif not header_has_span:
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell, row_header, row_header_tags
                    )
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)


def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
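    """Parse one header cell of a zh-forms table row.

    Text before the first <span> becomes the row header; "zh-Hant" and
    "zh-Hans" spans inside the cell are recorded directly as forms.
    Returns (row_header, row_header_tags, header_has_span).
    """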

    row_header = ""
    row_header_tags = []
    header_has_span = False
    first_span_index = len(header_cell.children)
    for index, span_tag in header_cell.find_html("span", with_index=True):
        if index < first_span_index:
            first_span_index = index
        header_has_span = True
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    for raw_tag in row_header.split(" và "):  # "và" = "and"
        raw_tag = raw_tag.strip()
        if raw_tag != "":
            row_header_tags.append(raw_tag)
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        form_nodes = []
        sup_title = ""
        for node in span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "sup":
                # the <sup> tooltip text becomes an extra raw tag
                for sup_span in node.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(node)
        if span_lang in ["zh-Hant", "zh-Hans"]:
            for word in clean_node(wxr, None, form_nodes).split("/"):
                if word not in [base_data.word, ""]:
                    form = Form(form=word, raw_tags=row_header_tags)
                    if sup_title != "":
                        form.raw_tags.append(sup_title)
                    translate_raw_tags(form)
                    base_data.forms.append(form)
    return row_header, row_header_tags, header_has_span


def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
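    """Parse one data cell of a zh-forms table row.

    "zh-Hant"/"zh-Hans"/"zh" spans yield forms; small-print spans
    (font-size:80%) carry raw tags that apply to the forms collected
    so far from the same cell.
    """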

251 for top_span_tag in cell.find_html("span"): 

252 forms = [] 

253 for span_tag in top_span_tag.find_html("span"): 

254 span_lang = span_tag.attrs.get("lang", "") 

255 if span_lang in ["zh-Hant", "zh-Hans", "zh"]: 

256 word = clean_node(wxr, None, span_tag) 

257 if word not in ["", "/", base_data.word]: 

258 form = Form(form=word, raw_tags=row_header_tags) 

259 if span_lang == "zh-Hant": 

260 form.tags.append("Traditional-Chinese") 

261 elif span_lang == "zh-Hans": 

262 form.tags.append("Simplified-Chinese") 

263 translate_raw_tags(form) 

264 forms.append(form) 

265 elif "font-size:80%" in span_tag.attrs.get("style", ""): 

266 raw_tag = clean_node(wxr, None, span_tag) 

267 if raw_tag != "": 

268 for form in forms: 

269 form.raw_tags.append(raw_tag) 

270 translate_raw_tags(form) 

271 base_data.forms.extend(forms)