Coverage for src/wiktextract/extractor/vi/page.py: 79%

73 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1from typing import Any 

2 

3from mediawiki_langcodes import name_to_code 

4from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .etymology import extract_etymology_section 

9from .linkage import extract_alt_form_section, extract_linkage_section 

10from .models import Sense, WordEntry 

11from .pos import extract_note_section, extract_pos_section 

12from .section_titles import LINKAGE_SECTIONS, POS_DATA, TRANSLATION_SECTIONS 

13from .sound import extract_sound_section 

14from .translation import extract_translation_section 

15 

16 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one heading node to its extractor, then descend into it.

    The cleaned heading text selects the extractor. Checks run in a fixed
    order: part-of-speech titles, translation titles, a handful of literal
    Vietnamese titles, linkage titles, and finally a debug message for
    anything not on the ignore list. After dispatch, section-level
    categories are collected and every child heading is parsed recursively.

    Args:
        wxr: Extraction context, passed through to every extractor.
        page_data: Entries gathered so far for the page; may be mutated.
        base_data: Language-level entry used when `page_data` is empty.
        level_node: The heading node to process.
    """
    heading = clean_node(wxr, None, level_node.largs)
    ignored_headings = ("Tham khảo", "Cách ra dấu", "Đọc thêm", "Xem thêm")

    if heading in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, heading)
        newest_has_no_senses = not page_data[-1].senses
        if newest_has_no_senses and heading in LINKAGE_SECTIONS:
            # The title is both a POS name and a linkage name and yielded
            # no senses: drop the last entry (presumably the one the POS
            # extractor just added) and read the section as linkage data.
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data or [base_data],
                level_node,
                LINKAGE_SECTIONS[heading],
            )
    elif heading in TRANSLATION_SECTIONS:
        target = page_data[-1] if page_data else base_data
        extract_translation_section(wxr, target, level_node)
    elif heading == "Cách phát âm":
        extract_sound_section(wxr, base_data, level_node)
    elif heading == "Từ nguyên":
        extract_etymology_section(wxr, base_data, level_node)
    elif heading == "Cách viết khác":
        extract_alt_form_section(wxr, base_data, page_data, level_node)
    elif heading == "Ghi chú sử dụng":
        target = page_data[-1] if page_data else base_data
        extract_note_section(wxr, target, level_node)
    elif heading in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data or [base_data],
            level_node,
            LINKAGE_SECTIONS[heading],
        )
    elif heading not in ignored_headings:
        wxr.wtp.debug(f"Unknown title: {heading}", sortid="vi/page/22")

    extract_section_cats(wxr, base_data, page_data, level_node)
    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

61 

62 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Vietnamese Wiktionary page into a list of entry dicts.

    Each level-2 heading is treated as a language section. A base entry is
    built for it and all nested headings are handed to `parse_section`.
    Entries that end up without senses receive a single "no-gloss" sense
    before being dumped.

    Args:
        wxr: Extraction context providing the parser and configuration.
        page_title: Title of the page being processed.
        page_text: Raw wikitext of the page.

    Returns:
        One dict per entry, produced by `model_dump(exclude_defaults=True)`;
        an empty list for pages in a skipped namespace.
    """
    # Page layout reference:
    # https://vi.wiktionary.org/wiki/Wiktionary:Sơ_đồ_mục_từ

    # Thesaurus, rhyme, quote and reconstruction pages are not parsed.
    skipped_prefixes = (
        "Kho từ vựng:",
        "Vần:",
        "Kho ngữ liệu:",
        "Từ tái tạo:",
    )
    if page_title.startswith(skipped_prefixes):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries = []
    for lang_node in root.find_child(NodeKind.LEVEL2):
        heading_cats = {}
        lang_name = (
            clean_node(wxr, heading_cats, lang_node.largs) or "unknown"
        )
        lang_code = name_to_code(lang_name, "vi") or "unknown"
        # A "langname" template in the heading overrides the looked-up code
        # with its first positional argument.
        for template in lang_node.find_content(NodeKind.TEMPLATE):
            if template.template_name != "langname":
                continue
            lang_code = clean_node(
                wxr, None, template.template_parameters.get(1, "")
            )

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = heading_cats.get("categories", [])
        extract_section_cats(wxr, base_data, entries, lang_node)
        for child_level in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, child_level)

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in entries]

109 

110 

def extract_section_cats(
    wxr: WiktextractContext,
    base_data: WordEntry,
    page_data: list[WordEntry],
    level_node: LevelNode,
):
    """Gather categories from a section's direct children and attach them.

    Link nodes and the category templates listed below are passed to
    `clean_node` together with a scratch dict; whatever ends up under its
    "categories" key is then distributed.

    If `page_data` is empty, or its last entry belongs to a different
    language than `base_data`, the categories go to `base_data`. Otherwise
    they are added to every entry in `page_data` that shares the last
    entry's language code.

    Args:
        wxr: Extraction context, passed through to `clean_node`.
        base_data: Language-level entry for the section being parsed.
        page_data: Entries gathered so far for the page.
        level_node: Heading node whose direct children are scanned.
    """
    collected = {}
    category_templates = ("topics", "C", "topic", "catlangname", "cln")
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        # `template_name` is only consulted for non-link nodes.
        if (
            child.kind == NodeKind.LINK
            or child.template_name in category_templates
        ):
            clean_node(wxr, collected, child)

    found = collected.get("categories", [])
    if not page_data or page_data[-1].lang_code != base_data.lang_code:
        base_data.categories.extend(found)
        return

    current_code = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_code:
            entry.categories.extend(found)