Coverage for src/wiktextract/extractor/share.py: 86%

103 statements  

coverage.py v7.14.0, created at 2026-05-11 04:48 +0000

  1  import hashlib
  2  import re
  3  from html import unescape
  4  from typing import Iterable, Optional, Union
  5
  6  from wikitextprocessor import NodeKind, WikiNode
  7
  8  from wiktextract.wxr_context import WiktextractContext
  9
 10  from ..page import clean_node
 11
 12

 13  def strip_nodes(
 14      nodes: list[Union[WikiNode, str]],
 15  ) -> Iterable[Union[WikiNode, str]]:
 16      # filter out nodes that contain only newlines, spaces and non-breaking spaces
 17      return filter(
 18          lambda node: isinstance(node, WikiNode)
 19          or (isinstance(node, str) and len(unescape(node).strip()) > 0),
 20          nodes,
 21      )
 22
 23
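# Usage sketch (illustrative; not part of the measured statements above):
# strip_nodes() keeps WikiNode children and any string with visible text;
# "&nbsp;" unescapes to U+00A0, which str.strip() treats as whitespace.
assert list(strip_nodes(["foo", "\n ", "&nbsp;", "bar"])) == ["foo", "bar"]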

 24  def capture_text_in_parentheses(text: str) -> tuple[list[str], str]:
 25      """
 26      Return a list of text found inside parentheses, and the rest of the text.
 27      """
 28      rest_parts = []
 29      capture_text_list = []
 30      last_group_end = 0
 31      for m in re.finditer(r"\([^()]+\)", text):
 32          not_captured = text[last_group_end : m.start()].strip()
 33          if len(not_captured) > 0:
 34              rest_parts.append(not_captured)
 35          last_group_end = m.end()
 36          group_text = m.group()[1:-1].strip()
 37          if len(group_text) > 0:
 38              capture_text_list.append(group_text)
 39      not_captured = text[last_group_end:].strip()
 40      if len(not_captured) > 0:
 41          rest_parts.append(not_captured)
 42      rest_text = " ".join(rest_parts) if len(rest_parts) > 0 else text
 43      return capture_text_list, rest_text
 44
 45
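# Usage sketch (illustrative; not part of the measured statements above):
# the regex matches only innermost, parenthesis-free "(...)" groups.
captured, rest = capture_text_in_parentheses("foo (bar) baz (qux)")
assert captured == ["bar", "qux"]
assert rest == "foo baz"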

 46  def split_chinese_variants(text: str) -> Iterable[tuple[Optional[str], str]]:
 47      """
 48      Yield (script code, text) tuples for a "Traditional/Simplified" pair.
 49      """
 50      if "/" in text:
 51          split_result = text.split("/")
 52          if len(split_result) != 2:
 53              yield None, text
 54          else:
 55              for variant_index, variant in enumerate(split_result):
 56                  yield "zh-Hant" if variant_index == 0 else "zh-Hans", variant
 57      else:
 58          yield None, text
 59
 60
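# Usage sketch (illustrative; not part of the measured statements above):
# exactly one "/" marks a Traditional/Simplified pair; anything else is
# passed through untagged.
assert list(split_chinese_variants("漢語/汉语")) == [
    ("zh-Hant", "漢語"),
    ("zh-Hans", "汉语"),
]
assert list(split_chinese_variants("中文")) == [(None, "中文")]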

 61  def create_audio_url_dict(filename: str) -> dict[str, str]:
 62      # remove whitespace and left-to-right marks
 63      filename = filename.strip(" \u200e")
 64      file_extension = filename[filename.rfind(".") + 1 :].lower()
 65      if file_extension == "ogv":
 66          # ".ogv" pages are redirected to ".oga" pages in Wikimedia Commons
 67          filename = filename[: filename.rfind(".")] + ".oga"
 68          file_extension = "oga"
 69      file_url_key = file_extension + "_url"
 70      filename_without_prefix = filename.removeprefix("File:")
 71      if len(filename_without_prefix) == 0:  [71 ↛ 72: condition never true]
 72          return {}
 73      audio_dict = {
 74          "audio": filename_without_prefix,
 75          file_url_key: "https://commons.wikimedia.org/wiki/Special:FilePath/"
 76          + filename_without_prefix,
 77      }
 78      transcode_formats = []
 79      if file_extension not in ("oga", "ogg"):
 80          transcode_formats.append("ogg")
 81      if file_extension != "mp3":  [81 ↛ 83: condition always true]
 82          transcode_formats.append("mp3")
 83      for file_suffix in transcode_formats:
 84          audio_dict[f"{file_suffix}_url"] = create_transcode_url(
 85              filename_without_prefix.replace(" ", "_"), file_suffix
 86          )
 87      return audio_dict
 88
 89
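# Usage sketch (illustrative; not part of the measured statements above):
# a ".wav" file gets a direct Special:FilePath URL plus transcoded
# "ogg"/"mp3" URLs; spaces become underscores only in the transcode path.
d = create_audio_url_dict("File:Example audio.wav")
assert d["audio"] == "Example audio.wav"
assert sorted(d) == ["audio", "mp3_url", "ogg_url", "wav_url"]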

 90  def create_transcode_url(filename: str, transcode_suffix: str) -> str:
 91      # Chinese Wiktionary templates might expand to a filename whose first
 92      # letter is lowercase, but the actual Wikimedia Commons file's first
 93      # letter is capitalized
 94      filename = filename[0].upper() + filename[1:]
 95      md5 = hashlib.md5(filename.encode()).hexdigest()
 96      return (
 97          "https://upload.wikimedia.org/wikipedia/commons/transcoded/"
 98          + f"{md5[0]}/{md5[:2]}/{filename}/{filename}.{transcode_suffix}"
 99      )
100
101
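# Usage sketch (illustrative; not part of the measured statements above):
# the path embeds the first one and first two hex digits of the filename's
# md5, mirroring Wikimedia's hashed upload directory layout.
url = create_transcode_url("Example_audio.wav", "mp3")
assert url.endswith("/Example_audio.wav/Example_audio.wav.mp3")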

102  def set_sound_file_url_fields(
103      wxr: WiktextractContext, filename, pydantic_model
104  ):
105      file_data = create_audio_url_dict(filename)
106      for key, value in file_data.items():
107          if hasattr(pydantic_model, key):  [107 ↛ 110: condition always true]
108              setattr(pydantic_model, key, value)
109          else:
110              wxr.wtp.warning(
111                  f"{key=} not defined in Sound",
112                  sortid="extractor.share.set_sound_file_url_fields",
113              )
114
115
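# Usage sketch (illustrative; not part of the measured statements above).
# DummySound is a hypothetical stand-in for the extractor's real Sound
# model; when every key maps to an attribute, the warning branch (and thus
# wxr) is never reached, so None is passed here purely for demonstration.
class DummySound:
    audio: str = ""
    wav_url: str = ""
    ogg_url: str = ""
    mp3_url: str = ""

sound = DummySound()
set_sound_file_url_fields(None, "File:Example.wav", sound)
assert sound.audio == "Example.wav"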

116  def split_senseids(senseids_str: str) -> list[str]:
117      senseids = []
118      raw_ids = (
119          senseids_str.strip().removeprefix("[").removesuffix("]").split(",")
120      )
121      for raw_id in raw_ids:
122          range_split = raw_id.split("-")
123          if len(range_split) == 1:
124              senseids.append(raw_id.strip())
125          elif len(range_split) == 2:  [125 ↛ 121: condition always true]
126              try:
127                  start = re.sub(r"[a-z]", "", range_split[0].strip())
128                  end = re.sub(r"[a-z]", "", range_split[1].strip())
129                  senseids.extend(
130                      [
131                          str(id)
132                          for id in range(
133                              int(start),
134                              int(end) + 1,
135                          )
136                      ]
137                  )
138              except Exception:
139                  pass
140
141      return senseids
142
143
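# Usage sketch (illustrative; not part of the measured statements above):
# numeric ranges are expanded, letter suffixes inside a range are dropped,
# and standalone ids are kept verbatim.
assert split_senseids("[1-3, 5a]") == ["1", "2", "3", "5a"]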

144  def calculate_bold_offsets(
145      wxr: WiktextractContext,
146      node: WikiNode,
147      node_text: str,
148      example,
149      field: str,
150      extra_node_kind: NodeKind | None = None,
151  ) -> None:
152      offsets = []
153      bold_words = set()
154      for b_tag in node.find_html_recursively("b"):
155          bold_words.add(clean_node(wxr, None, b_tag))
156      for strong_tag in node.find_html_recursively("strong"):
157          bold_words.add(clean_node(wxr, None, strong_tag))
158      for bold_node in node.find_child_recursively(
159          NodeKind.BOLD
160          if extra_node_kind is None
161          else NodeKind.BOLD | extra_node_kind
162      ):
163          bold_words.add(clean_node(wxr, None, bold_node))
164      for link_node in node.find_child_recursively(NodeKind.LINK):
165          if len(link_node.largs) > 0:  [165 ↛ 164: condition always true]
166              link_dest = clean_node(wxr, None, link_node.largs[0])
167              if "#" in link_dest and not link_dest.startswith("#"):
168                  link_dest = link_dest[: link_dest.index("#")]
169              if link_dest == wxr.wtp.title:
170                  link_text = clean_node(wxr, None, link_node)
171                  bold_words.add(link_text)
172
173      for bold_word in bold_words:
174          for m in re.finditer(re.escape(bold_word), node_text):
175              offsets.append((m.start(), m.end()))
176      if len(offsets) > 0:
177          if hasattr(example, field):  # pydantic model
178              setattr(example, field, sorted(offsets))
179          elif isinstance(example, dict):  [179 ↛ exit: condition always true]
180              example[field] = sorted(offsets)
181
182
183
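# Usage sketch (illustrative; not part of the measured statements above).
# Assumes the test-style setup used elsewhere in wiktextract; the exact
# constructor arguments may differ between versions.
from wikitextprocessor import Wtp
from wiktextract.config import WiktextractConfig

wxr = WiktextractContext(Wtp(), WiktextractConfig())
wxr.wtp.start_page("foo")
root = wxr.wtp.parse("foo '''bar''' baz")
example_dict: dict = {}
calculate_bold_offsets(wxr, root, "foo bar baz", example_dict, "bold_offsets")
assert example_dict["bold_offsets"] == [(4, 7)]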