Coverage for src/wiktextract/extractor/zh/descendant.py: 94%

94 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from mediawiki_langcodes import name_to_code 

2from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

3 

4from ...page import clean_node 

5from ...wxr_context import WiktextractContext 

6from ..ruby import extract_ruby 

7from .models import Descendant, WordEntry 

8from .tags import translate_raw_tags 

9 

10 

def extract_descendant_section(
    wxr: WiktextractContext, level_node: WikiNode, page_data: list[WordEntry]
) -> None:
    """Collect the descendants found under a section and store them on entries.

    Descendants come from two places directly below ``level_node``: its
    list items (via ``process_desc_list_item``) and any template whose
    lower-cased name is ``cjkv`` (via ``process_cjkv_template``).

    For a level-3 section the result is added to every entry in
    ``page_data`` whose ``lang_code`` equals that of the last entry;
    for any other level it is added to the last entry only.
    """
    # Only results that actually carry a word are kept from list items.
    descendants = [
        item
        for wiki_list in level_node.find_child(NodeKind.LIST)
        for li in wiki_list.find_child(NodeKind.LIST_ITEM)
        for item in process_desc_list_item(wxr, li, [])
        if item.word != ""
    ]
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name.lower() != "cjkv":
            continue
        descendants += process_cjkv_template(wxr, template)

    if level_node.kind != NodeKind.LEVEL3:
        # Nothing to attach to when no entry has been created yet.
        if page_data:
            page_data[-1].descendants.extend(descendants)
        return

    for entry in page_data:
        if entry.lang_code == page_data[-1].lang_code:
            entry.descendants.extend(descendants)

30 

31 

def process_cjkv_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Descendant]:
    """Expand a template node and return the descendants listed inside it.

    The template is converted back to wikitext, re-parsed with
    ``expand_all=True``, and every list found anywhere in the expanded
    tree is scanned. Each of its list items is handed to
    ``process_desc_list_item``; only results with a non-empty ``word``
    are returned.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    visited = set()
    results = []
    for wiki_list in root.find_child_recursively(NodeKind.LIST):
        # Guard against handling the same list node more than once.
        if wiki_list not in visited:
            visited.add(wiki_list)
            for li in wiki_list.find_child(NodeKind.LIST_ITEM):
                results.extend(
                    item
                    for item in process_desc_list_item(wxr, li, [])
                    if item.word != ""
                )
    return results

49 

50 

def process_desc_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    parent_data: list[Descendant],
    lang_code: str = "",
    lang_name: str = "",
) -> list[Descendant]:
    """Extract descendant entries from a list item node or an ``<li>`` tag.

    The function works in three passes over ``list_item``:

    1. Its direct children are scanned for a language name (a string
       ending in ``:``), romanization and qualifier ``<span>`` tags, and
       descendant/link templates, which are expanded and processed
       recursively.
    2. Every ``<span>`` carrying a ``lang`` attribute is treated as a word;
       a deep copy of the current state is stored for each usable word.
    3. Nested ``<ul>``/``<li>`` tags and nested wiki lists are processed
       recursively, with the entries found here passed as their parents.

    Args:
        wxr: Context passed to ``clean_node``/``extract_ruby`` and used to
            re-parse templates.
        list_item: Node whose children and descendants are examined.
        parent_data: Entries that receive everything found here in their
            ``descendants`` list.
        lang_code: Initial language code for the entry being built.
        lang_name: Initial language name for the entry being built.

    Returns:
        The entries found in this item. When none were found but a
        language name is known, a single entry holding only the language
        data is returned so that callers can reuse it.
    """
    # The two script tags are mutually exclusive for a given word form.
    script_tags = {
        "zh-Hant": "Traditional Chinese",
        "zh-Hans": "Simplified Chinese",
    }

    data_list = []
    data = Descendant(lang=lang_name, lang_code=lang_code)
    for child in list_item.children:
        if isinstance(child, str) and child.strip().endswith(":"):
            data.lang = child.strip(": ")
            data.lang_code = name_to_code(data.lang, "zh")
        elif isinstance(child, HTMLNode) and child.tag == "span":
            class_names = child.attrs.get("class", "")
            if "Latn" in class_names or "tr" in class_names:
                data.roman = clean_node(wxr, None, child)
            elif "qualifier-content" in class_names:
                # NOTE(review): the text is cleaned twice; the outer call
                # looks redundant but is kept to preserve current output.
                raw_tag = clean_node(wxr, None, clean_node(wxr, None, child))
                if raw_tag != "":
                    data.raw_tags.append(raw_tag)
        elif isinstance(child, HTMLNode) and child.tag == "i":
            for span_tag in child.find_html(
                "span", attr_name="class", attr_value="Latn"
            ):
                data.roman = clean_node(wxr, None, span_tag)
        elif isinstance(child, TemplateNode) and child.template_name in [
            "desctree",
            "descendants tree",
            "desc",
            "descendant",
            "ja-r",
            "zh-l",
        ]:
            if child.template_name.startswith("desc"):
                # Default to "" rather than None so that the span-based
                # fallback below can still fill in the language code.
                data.lang_code = child.template_parameters.get(1, "")
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            for new_data in process_desc_list_item(
                wxr,
                expanded_template,
                [],  # avoid add twice
                data.lang_code,
                data.lang,
            ):
                if new_data.word != "":
                    data_list.append(new_data)
                else:  # save lang data from desc template
                    data = new_data

    for span_tag in list_item.find_html("span", attr_name="lang"):
        ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag)
        old_word = data.word
        data.word = clean_node(wxr, None, nodes_without_ruby)
        data.ruby = ruby_data
        span_lang = span_tag.attrs["lang"]
        if data.lang_code == "":
            data.lang_code = span_lang
        script_tag = script_tags.get(span_lang)
        if script_tag is not None:
            # `data` is reused for every span, so drop any script tag left
            # over from an earlier span before adding the current one;
            # otherwise later words accumulate both tags or duplicates.
            data.tags = [
                tag for tag in data.tags if tag not in script_tags.values()
            ]
            data.tags.append(script_tag)
        if data.roman == data.word:
            if old_word == "":
                data.roman = ""
            else:  # roman tag also could have "lang"
                continue
        if data.word not in ["", "/"]:
            # Deep copy: `data` keeps being mutated by later spans.
            data_list.append(data.model_copy(deep=True))

    for ul_tag in list_item.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            process_desc_list_item(wxr, li_tag, data_list)
    for next_list in list_item.find_child(NodeKind.LIST):
        for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
            process_desc_list_item(wxr, next_list_item, data_list)

    # NOTE(review): the deep copies already stored in data_list were taken
    # before this call, so only `data` itself is passed to
    # translate_raw_tags here - confirm that this is intended.
    translate_raw_tags(data)
    for p_data in parent_data:
        p_data.descendants.extend(data_list)
    if len(data_list) == 0 and data.lang != "":
        # save lang name from desc template
        data_list.append(data)
    return data_list
67 if "Latn" in class_names or "tr" in class_names: 

68 data.roman = clean_node(wxr, None, child) 

69 elif "qualifier-content" in class_names: 

70 raw_tag = clean_node(wxr, None, clean_node(wxr, None, child)) 

71 if raw_tag != "": 71 ↛ 61line 71 didn't jump to line 61 because the condition on line 71 was always true

72 data.raw_tags.append(raw_tag) 

73 elif isinstance(child, HTMLNode) and child.tag == "i": 

74 for span_tag in child.find_html( 

75 "span", attr_name="class", attr_value="Latn" 

76 ): 

77 data.roman = clean_node(wxr, None, span_tag) 

78 elif isinstance(child, TemplateNode) and child.template_name in [ 

79 "desctree", 

80 "descendants tree", 

81 "desc", 

82 "descendant", 

83 "ja-r", 

84 "zh-l", 

85 ]: 

86 if child.template_name.startswith("desc"): 

87 data.lang_code = child.template_parameters.get(1) 

88 expanded_template = wxr.wtp.parse( 

89 wxr.wtp.node_to_wikitext(child), expand_all=True 

90 ) 

91 for new_data in process_desc_list_item( 

92 wxr, 

93 expanded_template, 

94 [], # avoid add twice 

95 data.lang_code, 

96 data.lang, 

97 ): 

98 if new_data.word != "": 

99 data_list.append(new_data) 

100 else: # save lang data from desc template 

101 data = new_data 

102 

103 for span_tag in list_item.find_html("span", attr_name="lang"): 

104 ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag) 

105 old_word = data.word 

106 data.word = clean_node(wxr, None, nodes_without_ruby) 

107 data.ruby = ruby_data 

108 if data.lang_code == "": 108 ↛ 109line 108 didn't jump to line 109 because the condition on line 108 was never true

109 data.lang_code = span_tag.attrs["lang"] 

110 span_lang = span_tag.attrs["lang"] 

111 if span_lang == "zh-Hant": 

112 data.tags.append("Traditional Chinese") 

113 elif span_lang == "zh-Hans": 

114 if "Traditional Chinese" in data.tags: 114 ↛ 116line 114 didn't jump to line 116 because the condition on line 114 was always true

115 data.tags.remove("Traditional Chinese") 

116 data.tags.append("Simplified Chinese") 

117 if data.roman == data.word: 

118 if old_word == "": 

119 data.roman = "" 

120 else: # roman tag also could have "lang" 

121 continue 

122 if data.word not in ["", "/"]: 

123 data_list.append(data.model_copy(deep=True)) 

124 

125 for ul_tag in list_item.find_html("ul"): 

126 for li_tag in ul_tag.find_html("li"): 

127 process_desc_list_item(wxr, li_tag, data_list) 

128 for next_list in list_item.find_child(NodeKind.LIST): 

129 for next_list_item in next_list.find_child(NodeKind.LIST_ITEM): 

130 process_desc_list_item(wxr, next_list_item, data_list) 

131 

132 translate_raw_tags(data) 

133 for p_data in parent_data: 

134 p_data.descendants.extend(data_list) 

135 if len(data_list) == 0 and data.lang != "": 

136 # save lang name from desc template 

137 data_list.append(data) 

138 return data_list