Coverage for src/wiktextract/extractor/zh/descendant.py: 91%

101 statements  

coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

  1  import re
  2
  3  from mediawiki_langcodes import name_to_code
  4  from wikitextprocessor import (
  5      HTMLNode,
  6      LevelNode,
  7      NodeKind,
  8      TemplateNode,
  9      WikiNode,
 10  )
 11
 12  from ...page import clean_node
 13  from ...wxr_context import WiktextractContext
 14  from ..ruby import extract_ruby
 15  from .models import Descendant, WordEntry
 16  from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
 17
 18
 19  def extract_descendant_section(
 20      wxr: WiktextractContext, level_node: LevelNode, page_data: list[WordEntry]
 21  ) -> None:
 22      desc_list = []
 23      for node in level_node.children:
 24          if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
 25              for list_node in level_node.find_child(NodeKind.LIST):
 26                  for list_item in list_node.find_child(NodeKind.LIST_ITEM):
 27                      desc_list.extend(
 28                          process_desc_list_item(wxr, list_item, [], [])[0]
 29                      )
 30          elif (
 31              isinstance(node, TemplateNode)
 32              and node.template_name.lower() == "cjkv"
 33          ):
 34              desc_list.extend(process_cjkv_template(wxr, node))
 35
 36      page_data[-1].descendants.extend(desc_list)
 37      for data in page_data[:-1]:
 38          if (  [38 ↛ 37: line 38 didn't jump to line 37 because the condition on line 38 was always true]
 39              data.lang_code == page_data[-1].lang_code
 40              and data.sounds == page_data[-1].sounds
 41              and data.etymology_text == page_data[-1].etymology_text
 42              and data.pos_level == page_data[-1].pos_level == level_node.kind
 43          ):
 44              data.descendants.extend(desc_list)
 45
 46
 47  def process_cjkv_template(
 48      wxr: WiktextractContext, t_node: TemplateNode
 49  ) -> list[Descendant]:
 50      expanded_node = wxr.wtp.parse(
 51          wxr.wtp.node_to_wikitext(t_node), expand_all=True
 52      )
 53      desc_list = []
 54      for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
 55          desc_list.extend(process_desc_list_item(wxr, list_item, [], [])[0])
 56      return desc_list
 57
 58
 59  def process_desc_list_item(
 60      wxr: WiktextractContext,
 61      list_item: WikiNode,
 62      parent_data: list[Descendant],
 63      raw_tags: list[str],
 64      lang_code: str = "unknown",
 65      lang_name: str = "unknown",
 66  ) -> tuple[list[Descendant], str, str]:
 67      # process list item node and <li> tag
 68      data_list = []
 69      before_word_raw_tags = []
 70      after_word = False
 71      for child in list_item.children:
 72          if isinstance(child, str) and child.strip().endswith(":"):
 73              lang_name = child.strip(": ") or "unknown"
 74              lang_code = name_to_code(lang_name, "zh") or "unknown"
 75          elif isinstance(child, str) and child.strip() == ",":  [75 ↛ 76: line 75 didn't jump to line 76 because the condition on line 75 was never true]
 76              after_word = False
 77          elif isinstance(child, HTMLNode) and child.tag == "span":
 78              extract_desc_span_tag(
 79                  wxr,
 80                  child,
 81                  data_list,
 82                  lang_code,
 83                  lang_name,
 84                  raw_tags,
 85                  before_word_raw_tags,
 86                  after_word,
 87              )
 88          elif (
 89              isinstance(child, HTMLNode)
 90              and child.tag == "i"
 91              and len(data_list) > 0
 92          ):
 93              for span_tag in child.find_html(
 94                  "span", attr_name="class", attr_value="Latn"
 95              ):
 96                  roman = clean_node(wxr, None, span_tag)
 97                  data_list[-1].roman = roman
 98                  if (
 99                      len(data_list) > 1
100                      and "Traditional-Chinese" in data_list[-2].tags
101                  ):
102                      data_list[-2].roman = roman
103          elif isinstance(child, TemplateNode) and child.template_name in [
104              "desctree",
105              "descendants tree",
106              "desc",
107              "descendant",
108              "ja-r",
109              "zh-l",
110              "zh-m",
111          ]:
112              if child.template_name.startswith("desc"):
113                  lang_code = child.template_parameters.get(1, "") or "unknown"
114              expanded_template = wxr.wtp.parse(
115                  wxr.wtp.node_to_wikitext(child), expand_all=True
116              )
117              new_data, new_l_code, new_l_name = process_desc_list_item(
118                  wxr,
119                  expanded_template,
120                  [],  # avoid add twice
121                  raw_tags,
122                  lang_code,
123                  lang_name,
124              )
125              data_list.extend(new_data)
126              # save lang data from desc template
127              lang_code = new_l_code
128              lang_name = new_l_name
129
130      for ul_tag in list_item.find_html("ul"):
131          for li_tag in ul_tag.find_html("li"):
132              process_desc_list_item(wxr, li_tag, data_list, [])
133      for next_list in list_item.find_child(NodeKind.LIST):
134          for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
135              process_desc_list_item(wxr, next_list_item, data_list, [])
136
137      for p_data in parent_data:
138          p_data.descendants.extend(data_list)
139      return data_list, lang_code, lang_name
140
141
142  def extract_desc_span_tag(
143      wxr: WiktextractContext,
144      span_tag: HTMLNode,
145      desc_lists: list[Descendant],
146      lang_code: str,
147      lang_name: str,
148      raw_tags: list[str],
149      before_word_raw_tags: list[str],
150      after_word: bool,
151  ) -> bool:
152      class_names = span_tag.attrs.get("class", "").split()
153      span_lang = span_tag.attrs.get("lang", "")
154      span_title = span_tag.attrs.get("title", "")
155      if ("tr" in class_names or span_lang.endswith("-Latn")) and len(
156          desc_lists
157      ) > 0:
158          roman = clean_node(wxr, None, span_tag)
159          desc_lists[-1].roman = roman
160          if len(desc_lists) > 1 and "Traditional-Chinese" in desc_lists[-2].tags:
161              desc_lists[-2].roman = roman
162      elif (
163          "qualifier-content" in class_names
164          or "gender" in class_names
165          or "label-content" in class_names
166      ) and len(desc_lists) > 0:
167          for raw_tag in re.split(r"，|,", clean_node(wxr, None, span_tag)):
168              raw_tag = raw_tag.strip()
169              if raw_tag == "":  [169 ↛ 170: line 169 didn't jump to line 170 because the condition on line 169 was never true]
170                  continue
171              if after_word:  [171 ↛ 172: line 171 didn't jump to line 172 because the condition on line 171 was never true]
172                  if raw_tag in TEMPLATE_TAG_ARGS:
173                      desc_lists[-1].tags.append(TEMPLATE_TAG_ARGS[raw_tag])
174                  else:
175                      desc_lists[-1].raw_tags.append(raw_tag)
176                      translate_raw_tags(desc_lists[-1])
177              else:
178                  before_word_raw_tags.append(raw_tag)
179      elif span_lang != "":
180          ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag)
181          desc_data = Descendant(
182              lang=lang_name,
183              lang_code=lang_code,
184              word=clean_node(wxr, None, nodes_without_ruby),
185              ruby=ruby_data,
186              raw_tags=before_word_raw_tags + raw_tags,
187          )
188          before_word_raw_tags.clear()
189          if desc_data.lang_code == "unknown":  [189 ↛ 190: line 189 didn't jump to line 190 because the condition on line 189 was never true]
190              desc_data.lang_code = span_lang
191          if "Hant" in class_names:
192              desc_data.tags.append("Traditional-Chinese")
193          elif "Hans" in class_names:
194              desc_data.tags.append("Simplified-Chinese")
195          if desc_data.word not in ["", "/"]:
196              translate_raw_tags(desc_data)
197              desc_lists.append(desc_data)
198              after_word = True
199      elif span_title != "" and clean_node(wxr, None, span_tag) in [
200          "→",
201          "⇒",
202          ">",
203          "?",
204      ]:
205          raw_tags.append(span_title)
206      elif "mention-gloss" in class_names and len(desc_lists) > 0:  [206 ↛ 207: line 206 didn't jump to line 207 because the condition on line 206 was never true]
207          desc_lists[-1].sense = clean_node(wxr, None, span_tag)
208
209      return after_word