Coverage for src/wiktextract/extractor/ja/page.py: 82%

84 statements  

coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

  1  import re
  2  from typing import Any
  3
  4  from mediawiki_langcodes import name_to_code
  5  from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
  6
  7  from ...page import clean_node
  8  from ...wxr_context import WiktextractContext
  9  from .conjugation import extract_conjugation_section
 10  from .etymology import extract_etymology_section
 11  from .linkage import extract_alt_form_section, extract_linkage_section
 12  from .models import Sense, WordEntry
 13  from .pos import extract_note_section, parse_pos_section
 14  from .section_titles import LINKAGES, POS_DATA
 15  from .sound import extract_homophone_section, extract_sound_section
 16  from .translation import extract_translation_section
 17
 18
 19  def parse_section(
 20      wxr: WiktextractContext,
 21      page_data: list[WordEntry],
 22      base_data: WordEntry,
 23      level_node: LevelNode,
 24  ) -> None:
 25      title_texts = re.sub(
 26          r"[\s\d]+$", "", clean_node(wxr, None, level_node.largs)
 27      )
 28      for title_text in re.split(r":|:|・", title_texts):
        ↛ 117  line 28 didn't jump to line 117 because the loop on line 28 didn't complete
 29          if title_text in POS_DATA:
 30              pre_len = len(page_data)
 31              parse_pos_section(wxr, page_data, base_data, level_node, title_text)
 32              if (
 33                  len(page_data) == pre_len
 34                  and title_text in LINKAGES
 35                  and pre_len > 0
 36              ):
 37                  extract_linkage_section(
 38                      wxr, page_data[-1], level_node, LINKAGES[title_text]
 39                  )
 40              break
 41          elif (
 42              title_text in ["語源", "由来", "字源", "出典", "語誌"]
 43              and wxr.config.capture_etymologies
 44          ):
 45              extract_etymology_section(wxr, page_data, base_data, level_node)
 46              break
 47          elif (
 48              title_text.startswith(("発音", "音価"))
 49              and wxr.config.capture_pronunciation
 50          ):
 51              extract_sound_section(wxr, page_data, base_data, level_node)
 52              break
 53          elif title_text in ["翻訳", "訳語"] and wxr.config.capture_translations:
 54              extract_translation_section(
 55                  wxr,
 56                  page_data[-1] if len(page_data) > 0 else base_data,
 57                  level_node,
 58              )
 59              break
 60          elif title_text in LINKAGES and wxr.config.capture_linkages:
 61              extract_linkage_section(
 62                  wxr,
 63                  page_data[-1]
 64                  if len(page_data) > 0
 65                  and page_data[-1].lang_code == base_data.lang_code
 66                  else base_data,
 67                  level_node,
 68                  LINKAGES[title_text],
 69              )
 70              break
 71          elif (
        ↛ 75  line 71 didn't jump to line 75 because the condition on line 71 was never true
 72              title_text in ["活用", "サ変動詞"]
 73              and wxr.config.capture_inflections
 74          ):
 75              extract_conjugation_section(
 76                  wxr,
 77                  page_data[-1] if len(page_data) > 0 else base_data,
 78                  level_node,
 79              )
 80              break
 81          elif title_text in [
        ↛ 88  line 81 didn't jump to line 88 because the condition on line 81 was never true
 82              "異表記",
 83              "別表記",
 84              "代替表記",
 85              "異形",
 86              "表記揺れ",
 87          ]:  # "異表記・別形", Template:alter
 88              extract_alt_form_section(
 89                  wxr,
 90                  page_data[-1]
 91                  if len(page_data) > 0
 92                  and page_data[-1].lang_code == base_data.lang_code
 93                  else base_data,
 94                  level_node,
 95              )
 96              break
 97          elif title_text in [
        ↛ 107  line 97 didn't jump to line 107 because the condition on line 97 was never true
 98              "用法",
 99              "注意点",
100              "留意点",
101              "注意",
102              "備考",
103              "表記",
104              "補足",
105              "語法",
106          ]:
107              extract_note_section(
108                  wxr,
109                  page_data[-1] if len(page_data) > 0 else base_data,
110                  level_node,
111              )
112              break
113          elif title_text == "同音異義語":
        ↛ 28  line 113 didn't jump to line 28 because the condition on line 113 was always true
114              extract_homophone_section(wxr, page_data, base_data, level_node)
115              break
116          else:
117              if title_text not in [
118                  "脚注",
119                  "参照",
120                  "参考文献",
121                  "参考",
122                  "同音の漢字",
123                  "参考辞書",
124                  "外部リンク",
125              ]:
126                  wxr.wtp.debug(
127                      f"Unknown section: {title_text}",
128                      sortid="extractor/ja/page/parse_section/93",
129                  )
130
131      for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
132          parse_section(wxr, page_data, base_data, next_level)
133
134      for t_node in level_node.find_child(NodeKind.TEMPLATE):
135          if t_node.template_name.endswith("-cat"):
        ↛ 136  line 135 didn't jump to line 136 because the condition on line 135 was never true
136              clean_node(
137                  wxr, page_data[-1] if len(page_data) > 0 else base_data, t_node
138              )
139
140
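For reference, the dispatch above hinges on a small normalization step: parse_section strips trailing whitespace and digits from the heading text (line 25) and then splits it on an ASCII colon, a fullwidth colon, or 「・」 (line 28), trying each piece against the known section types until one matches. A minimal sketch of that normalization, using a hypothetical heading string:

    >>> import re
    >>> heading = "名詞:形容詞 2"  # hypothetical heading text as returned by clean_node
    >>> title_texts = re.sub(r"[\s\d]+$", "", heading)
    >>> re.split(r":|:|・", title_texts)
    ['名詞', '形容詞']
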

141  def parse_page(
142      wxr: WiktextractContext, page_title: str, page_text: str
143  ) -> list[dict[str, Any]]:
144      # page layout
145      # https://ja.wiktionary.org/wiki/Wiktionary:スタイルマニュアル
146      if page_title.startswith(
        ↛ 149  line 146 didn't jump to line 149 because the condition on line 146 was never true
147          ("Appendix:", "シソーラス:")
148      ) or page_title.endswith("(活用)"):
149          return []
150      wxr.wtp.start_page(page_title)
151      tree = wxr.wtp.parse(page_text)
152      page_data: list[WordEntry] = []
153      for level2_node in tree.find_child(NodeKind.LEVEL2):
154          lang_name = clean_node(wxr, None, level2_node.largs)
155          if lang_name == "":
        ↛ 156  line 155 didn't jump to line 156 because the condition on line 155 was never true
156              lang_name = "unknown"
157              lang_code = "unknown"
158          else:
159              lang_code = name_to_code(lang_name, "ja")
160              if lang_code == "":
161                  for template in level2_node.find_content(NodeKind.TEMPLATE):
162                      if template.template_name == "L":
163                          lang_code = template.template_parameters.get(1, "")
164                      elif re.fullmatch(r"[a-z-]+", template.template_name):
        ↛ 161  line 164 didn't jump to line 161 because the condition on line 164 was always true
165                          lang_code = template.template_name
166                  if lang_code == "":
167                      lang_code = "unknown"
168          wxr.wtp.start_section(lang_name)
169          base_data = WordEntry(
170              word=wxr.wtp.title,
171              lang_code=lang_code,
172              lang=lang_name,
173              pos="unknown",
174          )
175          for link_node in level2_node.find_child(NodeKind.LINK):
176              clean_node(wxr, base_data, link_node)
177          for level3_node in level2_node.find_child(NodeKind.LEVEL3):
178              parse_section(wxr, page_data, base_data, level3_node)
179
180      for data in page_data:
181          if len(data.senses) == 0:
        ↛ 182  line 181 didn't jump to line 182 because the condition on line 181 was never true
182              data.senses.append(Sense(tags=["no-gloss"]))
183      return [m.model_dump(exclude_defaults=True) for m in page_data]
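
parse_page is the module's entry point: it skips Appendix, thesaurus, and conjugation pages, walks each level-2 (language) heading, resolves the language code with name_to_code (falling back to language templates in the heading), and serializes every collected WordEntry with model_dump(exclude_defaults=True). A minimal driver sketch; the Wtp and WiktionaryConfig arguments below are assumptions modeled on the project's test setup, not taken from this file:

    # Hypothetical usage sketch; constructor keywords are assumptions.
    from wikitextprocessor import Wtp

    from wiktextract.config import WiktionaryConfig
    from wiktextract.extractor.ja.page import parse_page
    from wiktextract.wxr_context import WiktextractContext

    wxr = WiktextractContext(
        Wtp(lang_code="ja", project="wiktionary"),  # assumed keyword arguments
        WiktionaryConfig(
            dump_file_lang_code="ja",       # assumed keyword argument
            capture_language_codes=None,    # assumed: None disables language filtering
        ),
    )
    page_text = "==日本語==\n===名詞===\n# ことばを集めて説明した本。\n"
    entries = parse_page(wxr, "辞書", page_text)  # list of JSON-ready dicts
    for entry in entries:
        print(entry.get("word"), entry.get("pos"))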