Coverage for src/wiktextract/extractor/en/descendant.py: 80%

1from copy import deepcopy

3from mediawiki_langcodes import name_to_code

4from wikitextprocessor import (

5 HTMLNode,

6 LevelNode,

7 NodeKind,

8 TemplateNode,

9 WikiNode,

10)

12from ...datautils import data_append, data_extend

13from ...page import clean_node

14from ...tags import valid_tags

15from ...wxr_context import WiktextractContext

16from ..ruby import extract_ruby

17from .type_utils import DescendantData, WordData

def extract_descendant_section(
    wxr: WiktextractContext,
    word_entry: WordData,
    level_node: LevelNode,
    is_derived: bool,
):
    """Gather descendant data from a section and attach it to the word entry.

    Descendants come from two places: "cjkv" templates that are direct
    children of ``level_node`` (matched case-insensitively), and every list
    found anywhere below ``level_node``.

    When ``is_derived`` is true, each top-level descendant that does not
    already carry the "derived" tag gets it.  The collected data is added to
    ``word_entry["descendants"]`` only if at least one descendant was found.
    """
    descendants = []
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if not isinstance(template, TemplateNode):
            continue
        if template.template_name.lower() == "cjkv":
            descendants.extend(extract_cjkv_template(wxr, template))

    # Lists are searched recursively to get around an unnecessarily
    # pre-expanded "top" template.  ``visited`` is shared with
    # extract_desc_list_item(), which records the nested lists it handles, so
    # those are skipped when the recursive search reaches them here.
    visited = set()
    for found_list in level_node.find_child_recursively(NodeKind.LIST):
        if found_list in visited:
            continue
        visited.add(found_list)
        for item in found_list.find_child(NodeKind.LIST_ITEM):
            item_data, _, _ = extract_desc_list_item(
                wxr, item, [], visited, []
            )
            descendants.extend(item_data)

    if is_derived:
        for entry in descendants:
            if "derived" not in entry.get("tags", []):
                data_append(entry, "tags", "derived")
    if descendants:
        data_extend(word_entry, "descendants", descendants)

def extract_cjkv_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[DescendantData]:
    """Expand a "cjkv" template and return the descendants found in its lists.

    The template node is converted back to wikitext, re-parsed with all
    templates expanded, and every list item of every list in the result is
    passed to extract_desc_list_item().
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)

    results = []
    # Shared with extract_desc_list_item(), which adds the nested lists it
    # processes, so the recursive search below does not visit them again.
    visited = set()
    for found_list in expanded_root.find_child_recursively(NodeKind.LIST):
        if found_list in visited:
            continue
        visited.add(found_list)
        for item in found_list.find_child(NodeKind.LIST_ITEM):
            item_data, _, _ = extract_desc_list_item(
                wxr, item, [], visited, []
            )
            results.extend(item_data)
    return results

def extract_desc_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    parent_data: list[DescendantData],
    seen_lists: set[WikiNode],
    raw_tags: list[str],
    lang_code: str = "unknown",
    lang_name: str = "unknown",
) -> tuple[list[DescendantData], str, str]:
    """Extract descendant data from one list item.

    ``list_item`` may be a wikitext list item node, an HTML ``<li>`` tag, or
    the root node of an expanded descendant/link template (the function
    calls itself with the latter).

    Args:
        wxr: Extraction context; used for parsing, text cleaning and the
            page title.
        list_item: Node whose children are scanned.
        parent_data: Entries that receive the extracted data under their
            "descendants" key.
        seen_lists: Lists already processed; nested lists handled here are
            added to it.
        raw_tags: Tags applying to words found in this item; may be appended
            to by extract_desc_span_tag().
        lang_code: Language code inherited from the caller.
        lang_name: Language name inherited from the caller.

    Returns:
        A tuple ``(data_list, lang_code, lang_name)``: the entries extracted
        from this item and the language code/name in effect after it.
    """
    data_list = []
    before_word_raw_tags = []
    after_word = False
    for child in list_item.children:
        if isinstance(child, str) and child.strip().endswith(":"):
            # Plain text ending in a colon is taken as the language name.
            lang_name = child.strip(": \n") or "unknown"
            lang_code = name_to_code(lang_name, "en") or "unknown"
        elif isinstance(child, str) and child.strip() == ",":
            # A comma separates words: following qualifiers are collected
            # for the next word rather than attached to the previous one.
            after_word = False
        elif isinstance(child, HTMLNode) and child.tag == "span":
            after_word = extract_desc_span_tag(
                wxr,
                child,
                data_list,
                lang_code,
                lang_name,
                raw_tags,
                before_word_raw_tags,
                after_word,
            )
        elif (
            isinstance(child, HTMLNode)
            and child.tag == "i"
            and len(data_list) > 0
        ):
            # Romanization inside an <i> tag applies to the latest word.
            for span_tag in child.find_html(
                "span", attr_name="class", attr_value="Latn"
            ):
                roman = clean_node(wxr, None, span_tag)
                if roman != "":
                    data_list[-1]["roman"] = roman
                    # The entry before the latest one also gets the
                    # romanization when it is tagged "Traditional-Chinese".
                    if len(
                        data_list
                    ) > 1 and "Traditional-Chinese" in data_list[-2].get(
                        "tags", []
                    ):
                        data_list[-2]["roman"] = roman
        elif isinstance(child, TemplateNode) and child.template_name in [
            "desctree",
            "descendants tree",
            "desc",
            "descendant",
            "ja-r",
            "zh-l",
            "zh-m",
            "link",  # used in Reconstruction pages
            "l",
        ]:
            if child.template_name.startswith("desc"):
                lang_code = child.template_parameters.get(1, "") or "unknown"
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            new_data, new_l_code, new_l_name = extract_desc_list_item(
                wxr,
                expanded_template,
                [],  # avoid add twice
                seen_lists,
                raw_tags,
                lang_code,
                lang_name,
            )
            data_list.extend(new_data)
            # save lang data from desc template
            lang_code = new_l_code
            lang_name = new_l_name

    # On Reconstruction pages, an item that names a language but yields no
    # word still produces an entry holding only the language data.
    if (
        wxr.wtp.title.startswith("Reconstruction:")
        and len(data_list) == 0
        and (lang_code != "unknown" or lang_name != "unknown")
    ):
        data = DescendantData(lang_code=lang_code, lang=lang_name)
        if len(raw_tags) > 0:
            # Store a copy: ``raw_tags`` is shared with the caller and may be
            # appended to later, which must not alter this entry.
            data["raw_tags"] = list(raw_tags)
        data_list.append(data)

    # Nested items, from HTML <ul>/<li> tags or wikitext sub-lists, become
    # descendants of every entry extracted from this item.
    for ul_tag in list_item.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            extract_desc_list_item(wxr, li_tag, data_list, seen_lists, [])
    for next_list in list_item.find_child(NodeKind.LIST):
        if next_list in seen_lists:
            continue
        seen_lists.add(next_list)
        for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
            extract_desc_list_item(
                wxr, next_list_item, data_list, seen_lists, []
            )

    for p_data in parent_data:
        data_extend(p_data, "descendants", data_list)
    return data_list, lang_code, lang_name

174

175

def extract_desc_span_tag(
    wxr: WiktextractContext,
    span_tag: HTMLNode,
    desc_lists: list[DescendantData],
    lang_code: str,
    lang_name: str,
    raw_tags: list[str],
    before_word_raw_tags: list[str],
    after_word: bool,
) -> bool:
    """Process one ``<span>`` tag found in a descendant list item.

    Depending on the span's ``class``, ``lang`` and ``title`` attributes the
    span is treated as a romanization, a qualifier/gender/label, a word, an
    arrow-like symbol whose title is a tag, or a gloss.

    Args:
        wxr: Extraction context used for text cleaning.
        span_tag: The span to process.
        desc_lists: Entries extracted so far from the current item; new
            words are appended and the latest entries may be updated.
        lang_code: Language code for new entries.
        lang_name: Language name for new entries.
        raw_tags: Tags applied to every new word; the title of an arrow-like
            span is appended to it.
        before_word_raw_tags: Qualifiers seen before a word; applied to the
            next word and then cleared.
        after_word: Whether a word has already been seen since the last
            separator.

    Returns:
        The updated ``after_word`` flag: ``True`` once a span with a
        ``lang`` attribute was handled as a word, otherwise the value that
        was passed in.
    """
    class_names = span_tag.attrs.get("class", "").split()
    span_lang = span_tag.attrs.get("lang", "")
    span_title = span_tag.attrs.get("title", "")
    if ("tr" in class_names or span_lang.endswith("-Latn")) and len(
        desc_lists
    ) > 0:
        # Romanization of the latest word.
        roman = clean_node(wxr, None, span_tag)
        if roman != "":
            # Reuse the cleaned text instead of cleaning the node again.
            desc_lists[-1]["roman"] = roman
            # The entry before the latest one also gets the romanization
            # when it is tagged "Traditional-Chinese".
            if len(desc_lists) > 1 and "Traditional-Chinese" in desc_lists[
                -2
            ].get("tags", []):
                desc_lists[-2]["roman"] = roman
    elif (
        "qualifier-content" in class_names
        or "gender" in class_names
        or "label-content" in class_names
    ) and len(desc_lists) > 0:
        for raw_tag in clean_node(wxr, None, span_tag).split(","):
            raw_tag = raw_tag.strip()
            if raw_tag != "":
                if after_word:
                    # Qualifier after a word belongs to that word.
                    data_append(
                        desc_lists[-1],
                        "tags" if raw_tag in valid_tags else "raw_tags",
                        raw_tag,
                    )
                else:
                    # Qualifier before a word is held for the next word.
                    before_word_raw_tags.append(raw_tag)
    elif span_lang != "":
        # Any other span with a ``lang`` attribute is treated as a word.
        ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag)
        desc_data = DescendantData(
            lang=lang_name,
            lang_code=lang_code,
            word=clean_node(wxr, None, nodes_without_ruby),
        )
        for raw_tag_list in [before_word_raw_tags, raw_tags]:
            for raw_tag in raw_tag_list:
                data_append(
                    desc_data,
                    "tags" if raw_tag in valid_tags else "raw_tags",
                    raw_tag,
                )
        before_word_raw_tags.clear()
        if len(ruby_data) > 0:
            desc_data["ruby"] = ruby_data
        if desc_data["lang_code"] == "unknown":
            # Fall back to the span's own ``lang`` attribute.
            desc_data["lang_code"] = span_lang
        if "Hant" in class_names:
            data_append(desc_data, "tags", "Traditional-Chinese")
        elif "Hans" in class_names:
            data_append(desc_data, "tags", "Simplified-Chinese")
        if desc_data["word"] not in ["", "／"]:
            desc_lists.append(deepcopy(desc_data))
        after_word = True
    elif span_title != "" and clean_node(wxr, None, span_tag) in [
        "→",
        "⇒",
        ">",
        "?",
    ]:
        # The span's title attribute is recorded as a raw tag.
        raw_tags.append(span_title)
    elif "mention-gloss" in class_names and len(desc_lists) > 0:
        sense = clean_node(wxr, None, span_tag)
        if sense != "":
            desc_lists[-1]["sense"] = sense

    return after_word

Coverage for src/wiktextract/extractor/en/descendant.py: 80%

124 statements