Coverage for src/wiktextract/extractor/ja/page.py: 81%

81 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .conjugation import extract_conjugation_section 

10from .etymology import extract_etymology_section 

11from .linkage import extract_alt_form_section, extract_linkage_section 

12from .models import Sense, WordEntry 

13from .pos import extract_note_section, parse_pos_section 

14from .section_titles import LINKAGES, POS_DATA 

15from .sound import extract_homophone_section, extract_sound_section 

16from .translation import extract_translation_section 

17 

18 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section heading to its extractor, then recurse.

    The heading text is cleaned, trailing whitespace/digits are removed and
    the remainder is split on the separator characters.  The pieces are
    examined in order; the first piece that matches a known section kind is
    handled and ends the scan.  When no piece matches, a debug message is
    logged for the last piece unless it is one of the ignored titles.

    Afterwards every child level node is parsed recursively, and every direct
    child template whose name ends with "-cat" is passed through
    ``clean_node``.
    """

    def latest_entry() -> WordEntry:
        # Most recent entry in page_data, or base_data when it is empty.
        return page_data[-1] if page_data else base_data

    def latest_same_lang_entry() -> WordEntry:
        # Like latest_entry(), but the last entry is only used when its
        # language code equals base_data's.
        if page_data and page_data[-1].lang_code == base_data.lang_code:
            return page_data[-1]
        return base_data

    heading = clean_node(wxr, None, level_node.largs)
    heading = re.sub(r"[\s\d]+$", "", heading)

    matched = False
    for title_text in re.split(r":|:|・", heading):
        if title_text in POS_DATA:
            count_before = len(page_data)
            parse_pos_section(
                wxr, page_data, base_data, level_node, title_text
            )
            # The POS parser added no entry: if the title is also a linkage
            # title, attach the section to the previous entry as linkage.
            if (
                len(page_data) == count_before
                and title_text in LINKAGES
                and count_before > 0
            ):
                extract_linkage_section(
                    wxr, page_data[-1], level_node, LINKAGES[title_text]
                )
        elif (
            title_text in ["語源", "由来", "字源", "出典"]
            and wxr.config.capture_etymologies
        ):
            extract_etymology_section(wxr, page_data, base_data, level_node)
        elif (
            title_text.startswith("発音")
            and wxr.config.capture_pronunciation
        ):
            extract_sound_section(wxr, page_data, base_data, level_node)
        elif title_text == "翻訳" and wxr.config.capture_translations:
            extract_translation_section(wxr, latest_entry(), level_node)
        elif title_text in LINKAGES and wxr.config.capture_linkages:
            extract_linkage_section(
                wxr,
                latest_same_lang_entry(),
                level_node,
                LINKAGES[title_text],
            )
        elif title_text == "活用" and wxr.config.capture_inflections:
            extract_conjugation_section(wxr, latest_entry(), level_node)
        elif title_text in [
            "異表記",
            "別表記",
        ]:  # "異表記・別形", Template:alter
            extract_alt_form_section(
                wxr, latest_same_lang_entry(), level_node
            )
        elif title_text in ["用法", "注意点", "留意点", "注意"]:
            extract_note_section(wxr, latest_entry(), level_node)
        elif title_text == "同音異義語":
            extract_homophone_section(wxr, page_data, base_data, level_node)
        else:
            # Nothing matched this piece; try the next one.
            continue
        matched = True
        break

    # title_text still holds the last piece here (re.split never returns an
    # empty list), which is what gets reported.
    if not matched and title_text not in ["脚注", "参照", "参考文献", "参考"]:
        wxr.wtp.debug(
            f"Unknown section: {title_text}",
            sortid="extractor/ja/page/parse_section/93",
        )

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name.endswith("-cat"):
            clean_node(wxr, latest_entry(), t_node)

113 

114 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as dumped dicts.

    Page layout reference:
    https://ja.wiktionary.org/wiki/Wiktionary:スタイルマニュアル

    Pages whose title starts with "Appendix:" or "シソーラス:" yield an empty
    list.  Otherwise each level-2 node is treated as a language section: its
    language code is looked up from the heading text, falling back to
    templates found in the section and finally to "unknown".  Every level-3
    child is handed to ``parse_section``.  Entries left without senses get a
    single "no-gloss" sense before being dumped.
    """
    skipped_prefixes = ("Appendix:", "シソーラス:")
    if page_title.startswith(skipped_prefixes):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for lang_section in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_section.largs)
        lang_code = name_to_code(lang_name, "ja")
        if lang_code == "":
            # No code for the heading text: take it from templates in the
            # section.  The loop does not stop early, so the last matching
            # template decides.
            for tpl in lang_section.find_content(NodeKind.TEMPLATE):
                tpl_name = tpl.template_name
                if tpl_name == "L":
                    lang_code = tpl.template_parameters.get(1, "")
                elif re.fullmatch(r"[a-z-]+", tpl_name):
                    lang_code = tpl_name
        if lang_code == "":
            lang_code = "unknown"

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for link_node in lang_section.find_child(NodeKind.LINK):
            clean_node(wxr, base_data, link_node)
        for level3_node in lang_section.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, level3_node)

    # Make sure every entry has at least one sense.
    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    dumped: list[dict[str, Any]] = []
    for entry in page_data:
        dumped.append(entry.model_dump(exclude_defaults=True))
    return dumped