Coverage for src/wiktextract/extractor/ja/page.py: 83%
67 statements
coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1    import re
2    from typing import Any

4    from mediawiki_langcodes import name_to_code
5    from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind

7    from ...page import clean_node
8    from ...wxr_context import WiktextractContext
9    from .conjugation import extract_conjugation_section
10   from .etymology import extract_etymology_section
11   from .linkage import extract_linkage_section
12   from .models import Sense, WordEntry
13   from .pos import parse_pos_section
14   from .section_titles import LINKAGES, POS_DATA
15   from .sound import extract_sound_section
16   from .translation import extract_translation_section
19   def parse_section(
20       wxr: WiktextractContext,
21       page_data: list[WordEntry],
22       base_data: WordEntry,
23       level_node: LevelNode,
24   ) -> None:
25       title_texts = clean_node(wxr, None, level_node.largs)
26       for title_text in re.split(r":|:|・", title_texts):  [26 ↛ 67: line 26 didn't jump to line 67 because the loop on line 26 didn't complete]
27           if title_text in POS_DATA:
28               pre_len = len(page_data)
29               parse_pos_section(wxr, page_data, base_data, level_node, title_text)
30               if (
31                   len(page_data) == pre_len
32                   and title_text in LINKAGES
33                   and pre_len > 0
34               ):
35                   extract_linkage_section(
36                       wxr, page_data[-1], level_node, LINKAGES[title_text]
37                   )
38               break
39           elif title_text in ["語源", "由来"] and wxr.config.capture_etymologies:
40               extract_etymology_section(wxr, page_data, base_data, level_node)
41               break
42           elif title_text.startswith("発音") and wxr.config.capture_pronunciation:
43               extract_sound_section(wxr, page_data, base_data, level_node)
44               break
45           elif title_text == "翻訳" and wxr.config.capture_translations:  [45 ↛ 46: line 45 didn't jump to line 46 because the condition on line 45 was never true]
46               extract_translation_section(
47                   wxr,
48                   page_data[-1] if len(page_data) > 0 else base_data,
49                   level_node,
50               )
51               break
52           elif title_text in LINKAGES and wxr.config.capture_linkages:  [52 ↛ 60: line 52 didn't jump to line 60 because the condition on line 52 was always true]
53               extract_linkage_section(
54                   wxr,
55                   page_data[-1] if len(page_data) > 0 else base_data,
56                   level_node,
57                   LINKAGES[title_text],
58               )
59               break
60           elif title_text == "活用" and wxr.config.capture_inflections:
61               extract_conjugation_section(
62                   wxr,
63                   page_data[-1] if len(page_data) > 0 else base_data,
64                   level_node,
65               )

67       for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
68           parse_section(wxr, page_data, base_data, next_level)

70       for t_node in level_node.find_child(NodeKind.TEMPLATE):
71           if t_node.template_name.endswith("-cat"):  [71 ↛ 72: line 71 didn't jump to line 72 because the condition on line 71 was never true]
72               clean_node(
73                   wxr, page_data[-1] if len(page_data) > 0 else base_data, t_node
74               )
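For orientation, a tiny self-contained sketch of the title-splitting step on line 26 that drives the branch coverage above. The example headings are made up, and whether a given piece is actually a key of POS_DATA or LINKAGES depends on section_titles.py, which this report does not show.

    import re

    # Same pattern as line 26: a heading may hold several candidate titles
    # separated by colons or the middle dot (・).
    for heading in ["名詞", "名詞:類義語"]:  # illustrative headings, not from the report
        print(heading, "->", re.split(r":|:|・", heading))
    # -> ['名詞'] and ['名詞', '類義語']; each piece is then matched against
    # POS_DATA, LINKAGES, and the fixed section names (語源, 発音, 翻訳, 活用).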
77   def parse_page(
78       wxr: WiktextractContext, page_title: str, page_text: str
79   ) -> list[dict[str, Any]]:
80       # page layout
81       # https://ja.wiktionary.org/wiki/Wiktionary:スタイルマニュアル
82       wxr.wtp.start_page(page_title)
83       tree = wxr.wtp.parse(page_text)
84       page_data: list[WordEntry] = []
85       for level2_node in tree.find_child(NodeKind.LEVEL2):
86           lang_name = clean_node(wxr, None, level2_node.largs)
87           lang_code = name_to_code(lang_name, "ja")
88           if lang_code == "":
89               for template in level2_node.find_content(NodeKind.TEMPLATE):
90                   if template.template_name == "L":  [90 ↛ 91: line 90 didn't jump to line 91 because the condition on line 90 was never true]
91                       lang_code = template.template_parameters.get(1, "")
92                   elif re.fullmatch(r"[a-z-]+", template.template_name):  [92 ↛ 89: line 92 didn't jump to line 89 because the condition on line 92 was always true]
93                       lang_code = template.template_name
94           if lang_code == "":  [94 ↛ 95: line 94 didn't jump to line 95 because the condition on line 94 was never true]
95               lang_code = "unknown"
96           wxr.wtp.start_section(lang_name)
97           base_data = WordEntry(
98               word=wxr.wtp.title,
99               lang_code=lang_code,
100              lang=lang_name,
101              pos="unknown",
102          )
103          for link_node in level2_node.find_child(NodeKind.LINK):
104              clean_node(wxr, base_data, link_node)
105          for level3_node in level2_node.find_child(NodeKind.LEVEL3):
106              parse_section(wxr, page_data, base_data, level3_node)

108      for data in page_data:
109          if len(data.senses) == 0:  [109 ↛ 110: line 109 didn't jump to line 110 because the condition on line 109 was never true]
110              data.senses.append(Sense(tags=["no-gloss"]))
111      return [m.model_dump(exclude_defaults=True) for m in page_data]
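Finally, a rough sketch of how parse_page() could be exercised directly, for example to reach the untaken branches flagged above. Only parse_page itself comes from this file (src/wiktextract/extractor/ja/page.py); the Wtp, WiktionaryConfig, and WiktextractContext constructor arguments are assumptions about the surrounding wikitextprocessor/wiktextract APIs and may differ between versions.

    from wikitextprocessor import Wtp  # assumed import path

    from wiktextract.config import WiktionaryConfig  # assumed import path
    from wiktextract.extractor.ja.page import parse_page
    from wiktextract.wxr_context import WiktextractContext

    # Assumed constructor keywords; check the installed versions before relying on them.
    wxr = WiktextractContext(
        Wtp(lang_code="ja"),
        WiktionaryConfig(dump_file_lang_code="ja"),
    )

    # A made-up page with one language (LEVEL2) and one POS (LEVEL3) section.
    wikitext = "==日本語==\n===名詞===\n\n# 仮の語釈\n"
    entries = parse_page(wxr, "例", wikitext)
    print(entries)  # list[dict[str, Any]] with defaults excluded by model_dump()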