Coverage for src / wiktextract / extractor / ja / page.py: 84%

90 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-17 07:22 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .conjugation import extract_conjugation_section 

10from .etymology import extract_etymology_section 

11from .kanji import extract_ja_kanji 

12from .linkage import extract_alt_form_section, extract_linkage_section 

13from .models import Sense, WordEntry 

14from .pos import extract_note_section, parse_pos_section 

15from .section_titles import LINKAGES, POS_DATA 

16from .sound import extract_homophone_section, extract_sound_section 

17from .translation import extract_translation_section 

18 

19 

# Section titles grouped by the extractor that handles them.
_ETYMOLOGY_TITLES = frozenset({"語源", "由来", "字源", "出典", "語誌"})
_SOUND_TITLE_PREFIXES = ("発音", "音価")
_TRANSLATION_TITLES = frozenset({"翻訳", "訳語"})
_CONJUGATION_TITLES = frozenset({"活用", "サ変動詞"})
# "異表記・別形", Template:alter
_ALT_FORM_TITLES = frozenset(
    {"異表記", "別表記", "代替表記", "異形", "表記揺れ"}
)
_NOTE_TITLES = frozenset(
    {"用法", "注意点", "留意点", "注意", "備考", "表記", "補足", "語法"}
)
_HOMOPHONE_TITLE = "同音異義語"
# Titles that are knowingly skipped and must not be reported as unknown.
_SILENT_TITLES = frozenset(
    {
        "脚注",
        "参照",
        "参考文献",
        "参考",
        "同音の漢字",
        "参考辞書",
        "外部リンク",
    }
)


def _current_entry(
    page_data: list[WordEntry], base_data: WordEntry
) -> WordEntry:
    """Return the most recently added entry, or `base_data` if none exist."""
    return page_data[-1] if page_data else base_data


def _current_entry_same_lang(
    page_data: list[WordEntry], base_data: WordEntry
) -> WordEntry:
    """Return the newest entry only if its `lang_code` matches `base_data`.

    Falls back to `base_data` when `page_data` is empty or the newest entry
    carries a different `lang_code`.
    """
    if page_data and page_data[-1].lang_code == base_data.lang_code:
        return page_data[-1]
    return base_data


def _handle_section_title(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    title_text: str,
) -> bool:
    """Run the extractor matching `title_text`, if there is one.

    Returns True when an extractor was invoked for this title, False when
    the title is unrecognised or its capture flag in `wxr.config` is off.
    The checks are ordered; the first matching rule wins.
    """
    if title_text in POS_DATA:
        count_before = len(page_data)
        parse_pos_section(wxr, page_data, base_data, level_node, title_text)
        # The POS parser added nothing: if the title is also a linkage
        # title, attach the section to the existing latest entry instead.
        if (
            len(page_data) == count_before
            and title_text in LINKAGES
            and count_before > 0
        ):
            extract_linkage_section(
                wxr, page_data[-1], level_node, LINKAGES[title_text]
            )
        return True

    if title_text in _ETYMOLOGY_TITLES and wxr.config.capture_etymologies:
        extract_etymology_section(wxr, page_data, base_data, level_node)
        return True

    if (
        title_text.startswith(_SOUND_TITLE_PREFIXES)
        and wxr.config.capture_pronunciation
    ):
        extract_sound_section(wxr, page_data, base_data, level_node)
        return True

    if title_text in _TRANSLATION_TITLES and wxr.config.capture_translations:
        extract_translation_section(
            wxr, _current_entry(page_data, base_data), level_node
        )
        return True

    if title_text in LINKAGES and wxr.config.capture_linkages:
        extract_linkage_section(
            wxr,
            _current_entry_same_lang(page_data, base_data),
            level_node,
            LINKAGES[title_text],
        )
        return True

    if title_text in _CONJUGATION_TITLES and wxr.config.capture_inflections:
        extract_conjugation_section(
            wxr, _current_entry(page_data, base_data), level_node
        )
        return True

    if title_text in _ALT_FORM_TITLES:
        extract_alt_form_section(
            wxr, _current_entry_same_lang(page_data, base_data), level_node
        )
        return True

    if title_text in _NOTE_TITLES:
        extract_note_section(
            wxr, _current_entry(page_data, base_data), level_node
        )
        return True

    if title_text == _HOMOPHONE_TITLE:
        extract_homophone_section(wxr, page_data, base_data, level_node)
        return True

    return False


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one heading node to its extractor, then recurse into children.

    The heading text is cleaned, trailing whitespace/digits are removed, and
    the remainder is split on separator characters. The parts are tried in
    order and processing stops at the first part that an extractor handles.
    If no part is handled, the last part is reported through
    `wxr.wtp.debug` unless it is one of the knowingly skipped titles.

    Afterwards every child heading is parsed recursively, and every direct
    child template whose name ends in "-cat" is passed to `clean_node`
    together with the latest entry (or `base_data` when there is none).
    """
    heading = clean_node(wxr, None, level_node.largs)
    title_texts = re.sub(r"[\s\d]+$", "", heading)
    # NOTE(review): the two colon alternatives look identical in this view;
    # one of them is presumably the full-width colon. Confirm against the
    # repository copy before applying.
    parts = re.split(r":|:|・", title_texts)

    # any() stops at the first handled part, like `break` in a plain loop.
    handled = any(
        _handle_section_title(wxr, page_data, base_data, level_node, part)
        for part in parts
    )
    if not handled:
        # re.split always yields at least one item, so parts[-1] exists.
        title_text = parts[-1]
        if title_text not in _SILENT_TITLES:
            wxr.wtp.debug(
                f"Unknown section: {title_text}",
                sortid="extractor/ja/page/parse_section/93",
            )

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name.endswith("-cat"):
            clean_node(wxr, _current_entry(page_data, base_data), template)

140 

141 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Japanese Wiktionary page into a list of entry dicts.

    Args:
        wxr: extraction context; its `wtp` processor parses `page_text`.
        page_title: title of the page being parsed.
        page_text: raw wikitext of the page.

    Returns:
        One dict per extracted `WordEntry`, dumped with
        `model_dump(exclude_defaults=True)`. Pages whose title starts with
        "Appendix:" or "シソーラス:", or ends with "(活用)", are skipped and
        yield an empty list.
    """
    # page layout
    # https://ja.wiktionary.org/wiki/Wiktionary:スタイルマニュアル
    if page_title.startswith(
        ("Appendix:", "シソーラス:")
    ) or page_title.endswith("(活用)"):
        return []

    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for level2_node in tree.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, level2_node.largs)
        if lang_name == "":
            lang_name = "unknown"
            lang_code = "unknown"
        else:
            lang_code = name_to_code(lang_name, "ja")
            if lang_code == "":
                # Name lookup failed: fall back to templates used in the
                # heading itself. The last matching template wins.
                for template in level2_node.find_content(NodeKind.TEMPLATE):
                    if template.template_name == "L":
                        # A template argument is not guaranteed to be a
                        # plain string (it may be a node or a list of
                        # nodes), so flatten it to text before using it
                        # as the language code.
                        lang_code = clean_node(
                            wxr,
                            None,
                            template.template_parameters.get(1, ""),
                        )
                    elif re.fullmatch(r"[a-z-]+", template.template_name):
                        # Template named like a language code, e.g. "en".
                        lang_code = template.template_name
            if lang_code == "":
                lang_code = "unknown"

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        # Direct child links and "-cat" templates are cleaned with
        # `base_data` as the target (presumably to collect categories on
        # it; confirm against `clean_node`).
        for link_node in level2_node.find_child(NodeKind.LINK):
            clean_node(wxr, base_data, link_node)
        for t_node in level2_node.find_child(NodeKind.TEMPLATE):
            if t_node.template_name.endswith("-cat"):
                clean_node(wxr, base_data, t_node)
            elif t_node.template_name == "ja-kanji":
                extract_ja_kanji(wxr, base_data, t_node)
        for level3_node in level2_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, level3_node)

    # Every emitted entry carries at least one sense; mark empty ones.
    for data in page_data:
        if not data.senses:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]