Coverage for src/wiktextract/extractor/tr/page.py: 93%
60 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import string
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .etymology import extract_etymology_section
10from .inflection import extract_inflection_section
11from .linkage import extract_linkage_section
12from .models import Sense, WordEntry
13from .pos import extract_note_section, extract_pos_section
14from .section_titles import LINKAGE_SECTIONS, LINKAGE_TAGS, POS_DATA
15from .sound import extract_sound_section
16from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one heading node to the extractor that matches its title.

    The heading text is cleaned and registered as the current subsection,
    then trailing digits and whitespace are stripped before it is compared
    against the known section titles.  After the matching extractor has
    run, every child heading is parsed recursively and every direct link
    child of the heading is passed through ``clean_node``.

    Args:
        wxr: Extraction context.
        page_data: Entries collected so far for the page; extended and
            modified in place.
        base_data: Template entry for the current language section; used
            as the target whenever ``page_data`` is empty.
        level_node: The heading node to process.
    """
    heading = clean_node(wxr, None, level_node.largs)
    wxr.wtp.start_subsection(heading)
    title_text = heading.rstrip(string.digits + string.whitespace)

    if title_text in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        # A title can be both a POS and a linkage title.  When the POS
        # extractor produced no senses, drop that entry and treat the
        # section as a linkage list instead.
        if not page_data[-1].senses and title_text in LINKAGE_SECTIONS:
            page_data.pop()
            target = page_data[-1] if page_data else base_data
            extract_linkage_section(
                wxr,
                target,
                level_node,
                LINKAGE_SECTIONS[title_text],
                LINKAGE_TAGS.get(title_text, []),
            )
    elif title_text == "Köken":
        # With nested headings, work on a private copy so the data is only
        # handed down to those subsections (see the recursion below).
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif title_text in ("Söyleniş", "Heceleme", "Söyleyiş"):
        has_subsections = level_node.contain_node(LEVEL_KIND_FLAGS)
        if has_subsections:
            base_data = base_data.model_copy(deep=True)
        # Attach to the latest entry only when it is in the same language
        # and this section has no nested headings; otherwise use base_data.
        if (
            page_data
            and page_data[-1].lang_code == base_data.lang_code
            and not has_subsections
        ):
            target = page_data[-1]
        else:
            target = base_data
        extract_sound_section(wxr, target, level_node)
    elif title_text == "Çeviriler":
        target = page_data[-1] if page_data else base_data
        extract_translation_section(wxr, target, level_node)
    elif title_text in LINKAGE_SECTIONS:
        target = page_data[-1] if page_data else base_data
        extract_linkage_section(
            wxr,
            target,
            level_node,
            LINKAGE_SECTIONS[title_text],
            LINKAGE_TAGS.get(title_text, []),
        )
    elif title_text == "Açıklamalar":
        target = page_data[-1] if page_data else base_data
        extract_note_section(wxr, target, level_node)
    elif title_text == "Çekimleme":
        target = page_data[-1] if page_data else base_data
        extract_inflection_section(wxr, target, level_node)
    elif title_text not in (
        "Kaynakça",
        "Ek okumalar",
        "Kaynaklar",
        "Dış Bağlantılar",
    ):
        # The titles listed above are skipped silently; anything else is
        # reported as unknown.
        wxr.wtp.debug(
            f"Unknown section: {title_text}",
            sortid="extractor/tr/page/parse_section/70",
        )

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    # NOTE(review): presumably this records category links on the entry;
    # confirm against clean_node.
    for link_node in level_node.find_child(NodeKind.LINK):
        target = page_data[-1] if page_data else base_data
        clean_node(wxr, target, link_node)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Turkish Wiktionary page into a list of entry dicts.

    Every level-2 heading is treated as a language section.  A base
    ``WordEntry`` is created for it and each child heading is handed to
    ``parse_section``, which appends entries to the shared result list.

    Args:
        wxr: Extraction context.
        page_title: Title of the page being parsed.
        page_text: Raw wikitext of the page.

    Returns:
        One dict per collected entry, dumped with default-valued fields
        excluded.  Entries without any sense get a placeholder sense
        tagged ``"no-gloss"``.
    """
    # page layout
    # https://tr.wiktionary.org/wiki/Vikisözlük:Girdilerin_biçimi
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, level2_node.largs)
        # Headings on tr.wiktionary.org are language names written in
        # Turkish, so the lookup must use "tr" as the name language.
        lang_code = name_to_code(lang_name, "tr") or "unknown"
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    # Make sure every entry has at least one sense.
    for data in page_data:
        if not data.senses:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]