Coverage for src/wiktextract/extractor/ja/page.py: 81%

1import re

2from typing import Any

4from mediawiki_langcodes import name_to_code

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind

7from ...page import clean_node

8from ...wxr_context import WiktextractContext

9from .conjugation import extract_conjugation_section

10from .etymology import extract_etymology_section

11from .linkage import extract_alt_form_section, extract_linkage_section

12from .models import Sense, WordEntry

13from .pos import extract_note_section, parse_pos_section

14from .section_titles import LINKAGES, POS_DATA

15from .sound import extract_homophone_section, extract_sound_section

16from .translation import extract_translation_section

def _entry_for_section(
    page_data: list[WordEntry],
    base_data: WordEntry,
    *,
    same_language: bool = False,
) -> WordEntry:
    """Choose the entry that a subsection's data is attached to.

    Returns the most recently added entry of ``page_data``.  Falls back to
    ``base_data`` when ``page_data`` is empty, or - if ``same_language`` is
    set - when the latest entry's ``lang_code`` differs from
    ``base_data.lang_code``.
    """
    if not page_data:
        return base_data
    latest = page_data[-1]
    if same_language and latest.lang_code != base_data.lang_code:
        return base_data
    return latest


def _handle_section_title(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    title_text: str,
) -> bool:
    """Run the extractor that matches one heading fragment.

    The checks are ordered; the first match wins.  Returns ``True`` when an
    extractor was invoked for ``title_text`` and ``False`` when the title
    matched none of the known section kinds (or the matching capture flag
    in ``wxr.config`` is off).
    """
    if title_text in POS_DATA:
        entries_before = len(page_data)
        parse_pos_section(wxr, page_data, base_data, level_node, title_text)
        nothing_added = len(page_data) == entries_before
        # The POS parser added no entry: if the title is also a linkage
        # title and an earlier entry exists, extract it as a linkage
        # section of that entry instead.
        if nothing_added and title_text in LINKAGES and entries_before > 0:
            extract_linkage_section(
                wxr, page_data[-1], level_node, LINKAGES[title_text]
            )
        return True

    if (
        title_text in ("語源", "由来", "字源", "出典")
        and wxr.config.capture_etymologies
    ):
        extract_etymology_section(wxr, page_data, base_data, level_node)
        return True

    if title_text.startswith("発音") and wxr.config.capture_pronunciation:
        extract_sound_section(wxr, page_data, base_data, level_node)
        return True

    if title_text == "翻訳" and wxr.config.capture_translations:
        target = _entry_for_section(page_data, base_data)
        extract_translation_section(wxr, target, level_node)
        return True

    if title_text in LINKAGES and wxr.config.capture_linkages:
        target = _entry_for_section(page_data, base_data, same_language=True)
        extract_linkage_section(
            wxr, target, level_node, LINKAGES[title_text]
        )
        return True

    if title_text == "活用" and wxr.config.capture_inflections:
        target = _entry_for_section(page_data, base_data)
        extract_conjugation_section(wxr, target, level_node)
        return True

    # "異表記・別形", Template:alter
    if title_text in ("異表記", "別表記"):
        target = _entry_for_section(page_data, base_data, same_language=True)
        extract_alt_form_section(wxr, target, level_node)
        return True

    if title_text in ("用法", "注意点", "留意点", "注意"):
        target = _entry_for_section(page_data, base_data)
        extract_note_section(wxr, target, level_node)
        return True

    if title_text == "同音異義語":
        extract_homophone_section(wxr, page_data, base_data, level_node)
        return True

    return False


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Process one heading node and, recursively, its sub-headings.

    The heading text has trailing whitespace/digits removed and is split on
    "：", ":" and "・".  The fragments are tried in order until one of them
    selects an extractor.  When none does, a debug message is logged for
    the last fragment unless it is one of the ignored titles.
    Afterwards every child heading is parsed with this same function, and
    each direct child template whose name ends in "-cat" is passed through
    ``clean_node`` with the current entry as target (presumably so that
    categories get recorded on it - confirm against ``clean_node``).
    """
    heading = clean_node(wxr, None, level_node.largs)
    heading = re.sub(r"[\s\d]+$", "", heading)
    # re.split() always yields at least one fragment, so titles[-1] exists.
    titles = re.split(r"：|:|・", heading)

    # any() stops at the first fragment that was handled.
    recognised = any(
        _handle_section_title(wxr, page_data, base_data, level_node, title)
        for title in titles
    )
    if not recognised:
        last_title = titles[-1]
        if last_title not in ("脚注", "参照", "参考文献", "参考"):
            wxr.wtp.debug(
                f"Unknown section: {last_title}",
                sortid="extractor/ja/page/parse_section/93",
            )

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    for template in level_node.find_child(NodeKind.TEMPLATE):
        if not template.template_name.endswith("-cat"):
            continue
        clean_node(wxr, _entry_for_section(page_data, base_data), template)

113

114

def _language_code_for_heading(level2_node: LevelNode, lang_name: str) -> str:
    """Work out the language code for a level-2 (language) heading.

    ``name_to_code`` is tried first.  If it yields an empty string, the
    templates found by ``find_content`` on the heading are scanned: an "L"
    template supplies its first positional parameter, and any other
    template whose name consists only of lowercase ASCII letters and
    hyphens supplies its own name.  Later templates override earlier ones.
    An empty result is replaced by "unknown".
    """
    code = name_to_code(lang_name, "ja")
    if code == "":
        for template in level2_node.find_content(NodeKind.TEMPLATE):
            name = template.template_name
            if name == "L":
                code = template.template_parameters.get(1, "")
            elif re.fullmatch(r"[a-z-]+", name):
                code = name
    return "unknown" if code == "" else code


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as dictionaries.

    Pages whose title starts with "Appendix:" or "シソーラス:" give an empty
    list.  Otherwise each level-2 heading starts a new base entry for the
    language it names, and each level-3 heading directly under it is handed
    to ``parse_section``.  Entries that end up without senses receive a
    single sense tagged "no-gloss".  Entries are serialised with
    ``model_dump(exclude_defaults=True)``.
    """
    # page layout
    # https://ja.wiktionary.org/wiki/Wiktionary:スタイルマニュアル
    skipped_prefixes = ("Appendix:", "シソーラス:")
    if page_title.startswith(skipped_prefixes):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_node.largs)
        lang_code = _language_code_for_heading(lang_node, lang_name)
        wxr.wtp.start_section(lang_name)
        base_entry = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        # Links that are direct children of the language heading are run
        # through clean_node with the base entry as target.
        for link in lang_node.find_child(NodeKind.LINK):
            clean_node(wxr, base_entry, link)
        for section_node in lang_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, entries, base_entry, section_node)

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in entries]