Coverage for src/wiktextract/extractor/ko/pos.py: 93% (97 statements)
coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
import re

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .linkage import (
    LINKAGE_TEMPLATES,
    extract_linkage_list_item,
    extract_linkage_template,
)
from .models import AltForm, Sense, WordEntry
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template
from .tags import translate_raw_tags
from .translation import extract_translation_template
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Parse one part-of-speech section into a new ``WordEntry``.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled
    from the section's templates and lists; the copy is discarded again
    at the end if no sense was extracted.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    raw_title = pos_title
    normalized_title = pos_title.removeprefix("보조 ").strip()
    if normalized_title in POS_DATA:
        entry.pos_title = raw_title
        pos_info = POS_DATA[normalized_title]
        entry.pos = pos_info["pos"]
        entry.tags.extend(pos_info.get("tags", []))
        # a "보조 " ("auxiliary") prefix marks an auxiliary use of the base POS
        if raw_title.startswith("보조 ") and "auxiliary" not in entry.tags:
            entry.tags.append("auxiliary")

    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode):
            t_name = child.template_name
            if t_name in SOUND_TEMPLATES:
                extract_sound_template(wxr, entry, child)
            elif t_name in LINKAGE_TEMPLATES:
                extract_linkage_template(wxr, entry, child)
            elif t_name == "외국어":
                # translation template; pass the most recent gloss as context
                last_gloss = (
                    entry.senses[-1].glosses[-1]
                    if len(entry.senses) > 0
                    else ""
                )
                extract_translation_template(wxr, entry, child, last_gloss)
        elif child.kind == NodeKind.LIST:
            # "#" lists hold glosses, other lists hold metadata/examples
            is_gloss_list = child.sarg.startswith("#")
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                if is_gloss_list:
                    extract_gloss_list_item(wxr, entry, list_item)
                else:
                    extract_unorderd_list_item(wxr, entry, list_item)

    if len(entry.senses) == 0:
        page_data.pop()
def extract_gloss_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Extract one "#" list item as a ``Sense`` of ``word_entry``.

    Gloss text is accumulated until a nested list is met, at which point
    the pending gloss is flushed to a sense and the nested items are
    handled by ``extract_unorderd_list_item``. ``translate_raw_tags`` is
    applied only when the final gloss is flushed, as in the original flow.
    """
    sense = Sense()
    pending: list = []
    for child in list_item.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # flush whatever gloss text was gathered before descending
            text = clean_node(wxr, sense, pending)
            if text != "":
                sense.glosses.append(text)
                word_entry.senses.append(sense)
                pending.clear()
            for nested in child.find_child(NodeKind.LIST_ITEM):
                extract_unorderd_list_item(wxr, word_entry, nested)
            continue
        if isinstance(child, TemplateNode):
            if child.template_name.endswith(" of"):
                # form-of template: record the target, keep it in the gloss
                extract_form_of_template(wxr, sense, child)
                pending.append(child)
                continue
            if child.template_name == "라벨":
                # label template: "(tag1, tag2)" → raw tags
                label_text = clean_node(wxr, sense, child).strip("()")
                for raw_tag in label_text.split(","):
                    sense.raw_tags.append(raw_tag.strip())
                continue
        pending.append(child)

    text = clean_node(wxr, sense, pending)
    if text != "":
        sense.glosses.append(text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)
def extract_unorderd_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Dispatch an unordered ("*") list item to the matching extractor.

    The item may hold an obsolete bold-number gloss (``'''1.'''``), an
    etymology line ("어원:"), a sense note ("참고:"/"참조:"), a linkage
    section line, or — when nothing matches — example text for the most
    recent sense.
    """
    seen_bold = False
    for idx, child in enumerate(list_item.children):
        is_bold = isinstance(child, WikiNode) and child.kind == NodeKind.BOLD
        if is_bold and not seen_bold:
            # `* '''1.''' gloss text`, terrible obsolete layout
            seen_bold = True
            bold_text = clean_node(wxr, None, child)
            if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
                # reparse the remainder as an ordinary gloss list item
                gloss_item = WikiNode(NodeKind.LIST_ITEM, 0)
                gloss_item.children = list_item.children[idx + 1 :]
                extract_gloss_list_item(wxr, word_entry, gloss_item)
                break
        elif isinstance(child, str) and "어원:" in child:
            e_nodes = [child[child.index(":") + 1 :]]
            e_nodes.extend(list_item.children[idx + 1 :])
            e_text = clean_node(wxr, None, e_nodes)
            if e_text != "":
                word_entry.etymology_texts.append(e_text)
            break
        elif (
            isinstance(child, str)
            and ("참고:" in child or "참조:" in child)
            and len(word_entry.senses) > 0
        ):
            sense = word_entry.senses[-1]
            sense.note = child[child.index(":") + 1 :].strip()
            sense.note += clean_node(
                wxr, sense, list_item.children[idx + 1 :]
            )
            break
        elif (
            isinstance(child, str)
            and ":" in child
            and child[: child.index(":")].strip() in LINKAGE_SECTIONS
        ):
            extract_linkage_list_item(wxr, word_entry, list_item, "")
            break
        elif len(word_entry.senses) > 0:
            # NOTE(review): no break here, so this runs for every
            # unmatched child with the whole list item — presumably only
            # reached once in practice; confirm before changing.
            extract_example_list_item(
                wxr, word_entry.senses[-1], list_item, word_entry.lang_code
            )
def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the target word of a "* of" form-of template.

    Adds the "form-of" tag to ``sense`` and, when the template names a
    target word, stores it as an ``AltForm``.
    """
    if "form-of" not in sense.tags:
        sense.tags.append("form-of")
    # "ko-hanja form of" keeps the target in its first positional
    # argument; the other "* of" templates use the second
    arg_index = 1 if t_node.template_name == "ko-hanja form of" else 2
    target = clean_node(
        wxr, None, t_node.template_parameters.get(arg_index, "")
    )
    if target != "":
        sense.form_of.append(AltForm(word=target))