Coverage for src/wiktextract/extractor/zh/descendant.py: 94%

92 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1from mediawiki_langcodes import name_to_code 

2from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

3 

4from ...page import clean_node 

5from ...wxr_context import WiktextractContext 

6from ..ruby import extract_ruby 

7from .models import Descendant, WordEntry 

8from .tags import translate_raw_tags 

9 

10 

def extract_descendant_section(
    wxr: WiktextractContext, level_node: WikiNode, page_data: list[WordEntry]
) -> None:
    """Harvest descendant entries from a descendants section.

    Entries come from two places under ``level_node``: plain wiki lists
    (each item handled by ``process_desc_list_item``) and ``cjkv``
    templates.  The collected descendants are attached to the last page
    entry and additionally to every earlier entry that shares the same
    language code, sounds, etymology text and POS level.
    """
    collected: list[Descendant] = []
    for wiki_list in level_node.find_child(NodeKind.LIST):
        for item in wiki_list.find_child(NodeKind.LIST_ITEM):
            # Drop placeholder results that carry no word form.
            collected.extend(
                d for d in process_desc_list_item(wxr, item, []) if d.word != ""
            )
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name.lower() == "cjkv":
            collected.extend(process_cjkv_template(wxr, t_node))

    last_entry = page_data[-1]
    last_entry.descendants.extend(collected)
    for entry in page_data[:-1]:
        # Sections can be shared between several POS entries of the same
        # word; copy the descendants to each matching sibling entry.
        shares_section = (
            entry.lang_code == last_entry.lang_code
            and entry.sounds == last_entry.sounds
            and entry.etymology_text == last_entry.etymology_text
            and entry.pos_level == last_entry.pos_level == level_node.kind
        )
        if shares_section:
            entry.descendants.extend(collected)

33 

34 

def process_cjkv_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Descendant]:
    """Expand a ``cjkv`` template and collect descendants from its lists.

    The template is rendered to wikitext and re-parsed with full
    expansion; every list found in the expanded tree is then processed
    item by item.  Results with an empty ``word`` are discarded.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    results: list[Descendant] = []
    visited_lists = set()
    for found_list in expanded.find_child_recursively(NodeKind.LIST):
        # Recursive search may yield the same list node more than once
        # (nested lists); process each list a single time.
        if found_list in visited_lists:
            continue
        visited_lists.add(found_list)
        for item in found_list.find_child(NodeKind.LIST_ITEM):
            results.extend(
                d for d in process_desc_list_item(wxr, item, []) if d.word != ""
            )
    return results

52 

53 

def process_desc_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    parent_data: list[Descendant],
    lang_code: str = "",
    lang_name: str = "",
) -> list[Descendant]:
    """Extract descendants from one list item (wiki list item or <li> tag).

    Walks the item's children to pick up language name, romanization,
    qualifier tags and descendant templates, then reads the word forms
    from ``<span lang=...>`` tags.  Nested lists are processed
    recursively; their results are appended to this item's ``data_list``
    via the ``parent_data`` argument.  Returns the list of descendants
    found at this level (each already passed to any ``parent_data``).
    """
    # process list item node and <li> tag
    data_list = []
    data = Descendant(lang=lang_name, lang_code=lang_code)
    for child in list_item.children:
        if isinstance(child, str) and child.strip().endswith(":"):
            # "Language name:" prefix text before the word links.
            data.lang = child.strip(": ")
            data.lang_code = name_to_code(data.lang, "zh")
        elif isinstance(child, HTMLNode) and child.tag == "span":
            class_names = child.attrs.get("class", "")
            if "Latn" in class_names or "tr" in class_names:
                # Romanization / transliteration span.
                data.roman = clean_node(wxr, None, child)
            elif "qualifier-content" in class_names:
                # NOTE(review): clean_node is applied twice here — the
                # inner call already returns a string; the outer call
                # looks redundant.  Confirm intent before changing.
                raw_tag = clean_node(wxr, None, clean_node(wxr, None, child))
                if raw_tag != "":
                    data.raw_tags.append(raw_tag)
        elif isinstance(child, HTMLNode) and child.tag == "i":
            # Italicized romanization wrapped in a Latin-script span.
            for span_tag in child.find_html(
                "span", attr_name="class", attr_value="Latn"
            ):
                data.roman = clean_node(wxr, None, span_tag)
        elif isinstance(child, TemplateNode) and child.template_name in [
            "desctree",
            "descendants tree",
            "desc",
            "descendant",
            "ja-r",
            "zh-l",
        ]:
            if child.template_name.startswith("desc"):
                # "desc"-family templates carry the language code as
                # their first positional parameter.
                data.lang_code = child.template_parameters.get(1)
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            for new_data in process_desc_list_item(
                wxr,
                expanded_template,
                [],  # avoid add twice
                data.lang_code,
                data.lang,
            ):
                if new_data.word != "":
                    data_list.append(new_data)
                else:  # save lang data from desc template
                    # Wordless result only carries lang info: rebind
                    # `data` so later spans inherit that language.
                    data = new_data

    for span_tag in list_item.find_html("span", attr_name="lang"):
        ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag)
        old_word = data.word
        data.word = clean_node(wxr, None, nodes_without_ruby)
        data.ruby = ruby_data
        if data.lang_code == "":
            data.lang_code = span_tag.attrs["lang"]
        span_lang = span_tag.attrs["lang"]
        if span_lang == "zh-Hant":
            data.tags.append("Traditional-Chinese")
        elif span_lang == "zh-Hans":
            # A simplified form usually follows the traditional form in
            # the same item; replace the stale script tag.
            if "Traditional-Chinese" in data.tags:
                data.tags.remove("Traditional-Chinese")
            data.tags.append("Simplified-Chinese")
        if data.roman == data.word:
            if old_word == "":
                # First span was misclassified as roman; clear it.
                data.roman = ""
            else:  # roman tag also could have "lang"
                # This span is the romanization of the previous word,
                # not a new word — skip emitting a duplicate entry.
                continue
        if data.word not in ["", "/"]:
            # "/" separates traditional/simplified pairs — not a word.
            data_list.append(data.model_copy(deep=True))

    # Recurse into nested lists; children append into data_list via the
    # parent_data parameter (no return value is used here).
    for ul_tag in list_item.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            process_desc_list_item(wxr, li_tag, data_list)
    for next_list in list_item.find_child(NodeKind.LIST):
        for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
            process_desc_list_item(wxr, next_list_item, data_list)

    translate_raw_tags(data)
    for p_data in parent_data:
        p_data.descendants.extend(data_list)
    if len(data_list) == 0 and data.lang != "":
        # save lang name from desc template
        data_list.append(data)
    return data_list