Coverage for src/wiktextract/extractor/zh/descendant.py: 94% (94 statements)
coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
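
(For orientation: a branch-coverage report like this one can be regenerated with coverage.py's Python API. A minimal sketch follows; the test path, source filter, and output directory are assumptions, not taken from this report, and driving coverage through pytest-cov is an equally common setup.)

# Minimal sketch (assumed paths): collect branch coverage while running the
# test suite, then write the HTML report that pages like this one come from.
import coverage
import pytest

cov = coverage.Coverage(branch=True, source=["src/wiktextract"])
cov.start()
pytest.main(["tests/"])                # run the tests under measurement
cov.stop()
cov.save()
cov.html_report(directory="htmlcov")   # emits one annotated page per module
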
  1  from mediawiki_langcodes import name_to_code
  2  from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode

  4  from ...page import clean_node
  5  from ...wxr_context import WiktextractContext
  6  from ..ruby import extract_ruby
  7  from .models import Descendant, WordEntry
  8  from .tags import translate_raw_tags
 11  def extract_descendant_section(
 12      wxr: WiktextractContext, level_node: WikiNode, page_data: list[WordEntry]
 13  ) -> None:
 14      desc_list = []
 15      for list_node in level_node.find_child(NodeKind.LIST):
 16          for list_item in list_node.find_child(NodeKind.LIST_ITEM):
 17              for data in process_desc_list_item(wxr, list_item, []):
 18                  if data.word != "":  # partial branch: 18 ↛ 17, condition always true
 19                      desc_list.append(data)
 20      for node in level_node.find_child(NodeKind.TEMPLATE):
 21          if node.template_name.lower() == "cjkv":  # partial branch: 21 ↛ 20, condition always true
 22              desc_list.extend(process_cjkv_template(wxr, node))

 24      if level_node.kind == NodeKind.LEVEL3:
 25          for data in page_data:
 26              if data.lang_code == page_data[-1].lang_code:  # partial branch: 26 ↛ 25, condition always true
 27                  data.descendants.extend(desc_list)
 28      elif len(page_data) > 0:  # partial branch: 28 ↛ exit, condition always true
 29          page_data[-1].descendants.extend(desc_list)
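
(For context, the wikitext shape this entry point consumes looks roughly like the invented sample below, written as a Python string. The heading and template arguments are illustrative only, and the comments map each part to the numbered lines in this listing.)

# Invented sample of a zh-Wiktionary descendants section (illustrative only).
SAMPLE_SECTION = """===派生語彙===
* 日語: {{ja-r|漢字|かんじ}}
* {{desc|ko|한자}}
{{CJKV|漢字|かんじ|한자|Hán tự}}
"""
# "日語:" prefix      -> lines 62-64 (plain language name before a colon)
# {{ja-r}} / {{desc}} -> lines 78-101 (template is expanded and re-parsed)
# {{CJKV}}            -> lines 20-22 (direct child of the section, handled by
#                        process_cjkv_template)
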
 32  def process_cjkv_template(
 33      wxr: WiktextractContext, template_node: TemplateNode
 34  ) -> list[Descendant]:
 35      expanded_template = wxr.wtp.parse(
 36          wxr.wtp.node_to_wikitext(template_node), expand_all=True
 37      )
 38      seen_lists = set()
 39      desc_list = []
 40      for list_node in expanded_template.find_child_recursively(NodeKind.LIST):
 41          if list_node in seen_lists:  # partial branch: 41 ↛ 42, condition never true
 42              continue
 43          seen_lists.add(list_node)
 44          for list_item in list_node.find_child(NodeKind.LIST_ITEM):
 45              for data in process_desc_list_item(wxr, list_item, []):
 46                  if data.word != "":  # partial branch: 46 ↛ 45, condition always true
 47                      desc_list.append(data)
 48      return desc_list
 51  def process_desc_list_item(
 52      wxr: WiktextractContext,
 53      list_item: WikiNode,
 54      parent_data: list[Descendant],
 55      lang_code: str = "",
 56      lang_name: str = "",
 57  ) -> list[Descendant]:
 58      # Process a list item node or an HTML <li> tag.
 59      data_list = []
 60      data = Descendant(lang=lang_name, lang_code=lang_code)
 61      for child in list_item.children:
 62          if isinstance(child, str) and child.strip().endswith(":"):
 63              data.lang = child.strip(": ")
 64              data.lang_code = name_to_code(data.lang, "zh")
 65          elif isinstance(child, HTMLNode) and child.tag == "span":
 66              class_names = child.attrs.get("class", "")
 67              if "Latn" in class_names or "tr" in class_names:
 68                  data.roman = clean_node(wxr, None, child)
 69              elif "qualifier-content" in class_names:
 70                  raw_tag = clean_node(wxr, None, clean_node(wxr, None, child))
 71                  if raw_tag != "":  # partial branch: 71 ↛ 61, condition always true
 72                      data.raw_tags.append(raw_tag)
 73          elif isinstance(child, HTMLNode) and child.tag == "i":
 74              for span_tag in child.find_html(
 75                  "span", attr_name="class", attr_value="Latn"
 76              ):
 77                  data.roman = clean_node(wxr, None, span_tag)
 78          elif isinstance(child, TemplateNode) and child.template_name in [
 79              "desctree",
 80              "descendants tree",
 81              "desc",
 82              "descendant",
 83              "ja-r",
 84              "zh-l",
 85          ]:
 86              if child.template_name.startswith("desc"):
 87                  data.lang_code = child.template_parameters.get(1)
 88              expanded_template = wxr.wtp.parse(
 89                  wxr.wtp.node_to_wikitext(child), expand_all=True
 90              )
 91              for new_data in process_desc_list_item(
 92                  wxr,
 93                  expanded_template,
 94                  [],  # avoid adding the data twice
 95                  data.lang_code,
 96                  data.lang,
 97              ):
 98                  if new_data.word != "":
 99                      data_list.append(new_data)
100                  else:  # save language data from the desc template
101                      data = new_data

103      for span_tag in list_item.find_html("span", attr_name="lang"):
104          ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag)
105          old_word = data.word
106          data.word = clean_node(wxr, None, nodes_without_ruby)
107          data.ruby = ruby_data
108          if data.lang_code == "":  # partial branch: 108 ↛ 109, condition never true
109              data.lang_code = span_tag.attrs["lang"]
110          span_lang = span_tag.attrs["lang"]
111          if span_lang == "zh-Hant":
112              data.tags.append("Traditional Chinese")
113          elif span_lang == "zh-Hans":
114              if "Traditional Chinese" in data.tags:  # partial branch: 114 ↛ 116, condition always true
115                  data.tags.remove("Traditional Chinese")
116              data.tags.append("Simplified Chinese")
117          if data.roman == data.word:
118              if old_word == "":
119                  data.roman = ""
120              else:  # a roman span can also carry a "lang" attribute
121                  continue
122          if data.word not in ["", "/"]:
123              data_list.append(data.model_copy(deep=True))

125      for ul_tag in list_item.find_html("ul"):
126          for li_tag in ul_tag.find_html("li"):
127              process_desc_list_item(wxr, li_tag, data_list)
128      for next_list in list_item.find_child(NodeKind.LIST):
129          for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
130              process_desc_list_item(wxr, next_list_item, data_list)

132      translate_raw_tags(data)
133      for p_data in parent_data:
134          p_data.descendants.extend(data_list)
135      if len(data_list) == 0 and data.lang != "":
136          # save the language name from the desc template
137          data_list.append(data)
138      return data_list
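
(To illustrate the output shape: process_desc_list_item returns a flat list of Descendant models and attaches deeper list levels to their parents through the descendants field, as on lines 128-134. A hedged sketch follows; the field names come from the code above, while the concrete values are invented.)

# Hypothetical example of the nested structure built above; values are invented.
from wiktextract.extractor.zh.models import Descendant

parent = Descendant(lang="日語", lang_code="ja", word="大阪", roman="Ōsaka")
child = Descendant(lang="朝鮮語", lang_code="ko", word="오사카", roman="Osaka")
parent.descendants.append(child)  # same attachment as p_data.descendants.extend(data_list)
print(parent.model_dump(exclude_defaults=True))  # pydantic v2, as implied by model_copy above
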