Coverage for src/wiktextract/extractor/ja/page.py: 81%
81 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1import re
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .conjugation import extract_conjugation_section
10from .etymology import extract_etymology_section
11from .linkage import extract_alt_form_section, extract_linkage_section
12from .models import Sense, WordEntry
13from .pos import extract_note_section, parse_pos_section
14from .section_titles import LINKAGES, POS_DATA
15from .sound import extract_homophone_section, extract_sound_section
16from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section heading to its extractor, then recurse.

    The heading text of ``level_node`` is cleaned, stripped of trailing
    whitespace and digits, and split into individual titles.  The titles
    are tried in order; the first one that matches a known section kind
    is handled and the remaining titles are ignored.  If no title
    matches, a debug message is logged unless the last title is one of
    the reference-style headings that are skipped silently.

    Afterwards every child section of ``level_node`` is processed
    recursively, and every direct child template whose name ends with
    ``-cat`` is passed through ``clean_node`` against the most recent
    entry (or ``base_data`` when ``page_data`` is empty).

    Args:
        wxr: Extraction context; its ``config.capture_*`` flags gate the
            etymology, pronunciation, translation, linkage and
            conjugation branches.
        page_data: Entries collected so far for the page; passed to the
            section extractors and usually targeted via its last item.
        base_data: Language-level entry used as the fallback target when
            no suitable entry exists in ``page_data``.
        level_node: The section heading node to process.
    """
    title_texts = re.sub(
        r"[\s\d]+$", "", clean_node(wxr, None, level_node.largs)
    )
    # A heading may join several titles with an ASCII colon, a fullwidth
    # colon (U+FF1A) or a katakana middle dot.  The fullwidth colon is
    # written as a regex escape so it cannot be silently folded into the
    # ASCII colon by Unicode normalisation of this file.
    for title_text in re.split(r"[:\uff1a・]", title_texts):
        if title_text in POS_DATA:
            pre_len = len(page_data)
            parse_pos_section(wxr, page_data, base_data, level_node, title_text)
            # The title is both a POS and a linkage name: if POS parsing
            # produced no new entry, treat the section as a linkage list
            # of the previous entry.
            if (
                len(page_data) == pre_len
                and title_text in LINKAGES
                and pre_len > 0
            ):
                extract_linkage_section(
                    wxr, page_data[-1], level_node, LINKAGES[title_text]
                )
            break
        elif (
            title_text in ("語源", "由来", "字源", "出典")
            and wxr.config.capture_etymologies
        ):
            extract_etymology_section(wxr, page_data, base_data, level_node)
            break
        elif title_text.startswith("発音") and wxr.config.capture_pronunciation:
            extract_sound_section(wxr, page_data, base_data, level_node)
            break
        elif title_text == "翻訳" and wxr.config.capture_translations:
            extract_translation_section(
                wxr,
                page_data[-1] if page_data else base_data,
                level_node,
            )
            break
        elif title_text in LINKAGES and wxr.config.capture_linkages:
            # Only attach to the last entry when it belongs to the
            # language section currently being parsed.
            extract_linkage_section(
                wxr,
                page_data[-1]
                if page_data
                and page_data[-1].lang_code == base_data.lang_code
                else base_data,
                level_node,
                LINKAGES[title_text],
            )
            break
        elif title_text == "活用" and wxr.config.capture_inflections:
            extract_conjugation_section(
                wxr,
                page_data[-1] if page_data else base_data,
                level_node,
            )
            break
        elif title_text in (
            "異表記",
            "別表記",
        ):  # "異表記・別形", Template:alter
            extract_alt_form_section(
                wxr,
                page_data[-1]
                if page_data
                and page_data[-1].lang_code == base_data.lang_code
                else base_data,
                level_node,
            )
            break
        elif title_text in ("用法", "注意点", "留意点", "注意"):
            extract_note_section(
                wxr,
                page_data[-1] if page_data else base_data,
                level_node,
            )
            break
        elif title_text == "同音異義語":
            extract_homophone_section(wxr, page_data, base_data, level_node)
            break
    else:
        # Reached only when no title matched (the loop never hit `break`).
        # `title_text` is always bound here because `re.split` returns at
        # least one element.
        if title_text not in ("脚注", "参照", "参考文献", "参考"):
            wxr.wtp.debug(
                f"Unknown section: {title_text}",
                sortid="extractor/ja/page/parse_section/93",
            )

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name.endswith("-cat"):
            # Evaluated per template: `page_data` may have grown during
            # the dispatch and recursion above.
            clean_node(
                wxr, page_data[-1] if page_data else base_data, t_node
            )
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as dumped dicts.

    Pages whose title starts with ``Appendix:`` or ``シソーラス:`` yield an
    empty list.  Otherwise each level-2 heading is treated as a language
    section: a base ``WordEntry`` is built for it and every level-3
    child section is handed to ``parse_section``.  Entries that end up
    without senses receive a single ``no-gloss`` sense before being
    dumped with ``model_dump(exclude_defaults=True)``.
    """
    # page layout
    # https://ja.wiktionary.org/wiki/Wiktionary:スタイルマニュアル
    skipped_prefixes = ("Appendix:", "シソーラス:")
    if page_title.startswith(skipped_prefixes):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_node.largs)
        lang_code = name_to_code(lang_name, "ja")
        if lang_code == "":
            # Name lookup failed: fall back to templates in the heading.
            # There is no early exit, so the last matching template wins.
            for tmpl in lang_node.find_content(NodeKind.TEMPLATE):
                tmpl_name = tmpl.template_name
                if tmpl_name == "L":
                    lang_code = tmpl.template_parameters.get(1, "")
                elif re.fullmatch(r"[a-z-]+", tmpl_name):
                    lang_code = tmpl_name
        if lang_code == "":
            lang_code = "unknown"

        wxr.wtp.start_section(lang_name)
        lang_entry = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        # Links placed directly under the language heading are cleaned
        # against the language-level entry.
        for link in lang_node.find_child(NodeKind.LINK):
            clean_node(wxr, lang_entry, link)
        for section in lang_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, entries, lang_entry, section)

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in entries]