Coverage for src/wiktextract/extractor/share.py: 84% (102 statements)

import hashlib
import re
from html import unescape
from typing import Iterable, Optional, Union

from wikitextprocessor import NodeKind, WikiNode

def strip_nodes(
    nodes: list[Union[WikiNode, str]],
) -> Iterable[Union[WikiNode, str]]:
    # Filter out string nodes that contain only newlines, whitespace and
    # non-breaking spaces.
    return filter(
        lambda node: isinstance(node, WikiNode)
        or (isinstance(node, str) and len(unescape(node).strip()) > 0),
        nodes,
    )
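
# Illustrative sketch (not part of the original module): plain strings that
# unescape to nothing but whitespace are dropped, while WikiNode objects and
# non-empty strings pass through.
#
#     >>> list(strip_nodes(["\n", "&nbsp;", "  ", "word"]))
#     ['word']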


def capture_text_in_parentheses(text: str) -> tuple[list[str], str]:
    """
    Return a list of the texts inside parentheses and the remaining text.
    """
    rest_parts = []
    capture_text_list = []
    last_group_end = 0
    for m in re.finditer(r"\([^()]+\)", text):
        not_captured = text[last_group_end : m.start()].strip()
        if len(not_captured) > 0:
            rest_parts.append(not_captured)
        last_group_end = m.end()
        group_text = m.group()[1:-1].strip()
        if len(group_text) > 0:
            capture_text_list.append(group_text)
    not_captured = text[last_group_end:].strip()
    if len(not_captured) > 0:
        rest_parts.append(not_captured)
    rest_text = " ".join(rest_parts) if len(rest_parts) > 0 else text
    return capture_text_list, rest_text
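
# Illustrative sketch (not part of the original module): parenthesized
# fragments are captured, and the leftover pieces are re-joined with spaces.
#
#     >>> capture_text_in_parentheses("(archaic) to go (rare)")
#     (['archaic', 'rare'], 'to go')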


def split_chinese_variants(text: str) -> Iterable[tuple[Optional[str], str]]:
    """
    Yield the Chinese script variant code ("zh-Hant" or "zh-Hans") and the
    text.
    """
    if "/" in text:
        split_result = text.split("/")
        if len(split_result) != 2:
            yield None, text
        else:
            for variant_index, variant in enumerate(split_result):
                yield "zh-Hant" if variant_index == 0 else "zh-Hans", variant
    else:
        yield None, text
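
# Illustrative sketch (not part of the original module): a
# "traditional/simplified" pair is split into tagged variants; anything else
# is passed through untagged.
#
#     >>> list(split_chinese_variants("漢語/汉语"))
#     [('zh-Hant', '漢語'), ('zh-Hans', '汉语')]
#     >>> list(split_chinese_variants("word"))
#     [(None, 'word')]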


def create_audio_url_dict(filename: str) -> dict[str, str]:
    # remove whitespace and the left-to-right mark
    filename = filename.strip(" \u200e")
    file_extension = filename[filename.rfind(".") + 1 :].lower()
    if file_extension == "ogv":
        # ".ogv" pages are redirected to ".oga" pages on Wikimedia Commons
        filename = filename[: filename.rfind(".")] + ".oga"
        file_extension = "oga"
    file_url_key = file_extension + "_url"
    filename_without_prefix = filename.removeprefix("File:")
    if len(filename_without_prefix) == 0:
        return {}
    audio_dict = {
        "audio": filename_without_prefix,
        file_url_key: "https://commons.wikimedia.org/wiki/Special:FilePath/"
        + filename_without_prefix,
    }
    transcode_formats = []
    if file_extension not in ("oga", "ogg"):
        transcode_formats.append("ogg")
    if file_extension != "mp3":
        transcode_formats.append("mp3")
    for file_suffix in transcode_formats:
        audio_dict[f"{file_suffix}_url"] = create_transcode_url(
            filename_without_prefix.replace(" ", "_"), file_suffix
        )
    return audio_dict
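
# Illustrative sketch (not part of the original module), using a hypothetical
# filename: for an ".ogg" file the original upload is linked via
# Special:FilePath, and only an MP3 transcode URL is added.
#
#     >>> d = create_audio_url_dict("File:en-us-example.ogg")
#     >>> d["audio"]
#     'en-us-example.ogg'
#     >>> d["ogg_url"]
#     'https://commons.wikimedia.org/wiki/Special:FilePath/en-us-example.ogg'
#     >>> "mp3_url" in d and "oga_url" not in d
#     True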


def create_transcode_url(filename: str, transcode_suffix: str) -> str:
    # A Chinese Wiktionary template might expand to a filename whose first
    # letter is lowercase, while the actual Wikimedia Commons filename starts
    # with a capital letter.
    filename = filename[0].upper() + filename[1:]
    md5 = hashlib.md5(filename.encode()).hexdigest()
    return (
        "https://upload.wikimedia.org/wikipedia/commons/transcoded/"
        + f"{md5[0]}/{md5[:2]}/{filename}/{filename}.{transcode_suffix}"
    )
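
# Illustrative sketch (not part of the original module): the transcode path
# embeds the first one and first two hex digits of the MD5 of the capitalized
# filename, e.g. for a hypothetical "example.ogg":
#
#     >>> url = create_transcode_url("example.ogg", "mp3")
#     >>> url.startswith("https://upload.wikimedia.org/wikipedia/commons/transcoded/")
#     True
#     >>> url.endswith("/Example.ogg/Example.ogg.mp3")
#     True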


def set_sound_file_url_fields(wxr, filename, pydantic_model):
    file_data = create_audio_url_dict(filename)
    for key, value in file_data.items():
        if hasattr(pydantic_model, key):
            setattr(pydantic_model, key, value)
        else:
            wxr.wtp.warning(
                f"{key=} not defined in Sound",
                sortid="extractor.share.set_sound_file_url_fields",
            )
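
# Illustrative sketch (not part of the original module), assuming a pydantic
# Sound model that defines "audio", "ogg_url" and "mp3_url" fields and an
# extractor context "wxr":
#
#     sound = Sound()
#     set_sound_file_url_fields(wxr, "File:en-us-example.ogg", sound)
#     # sound.audio, sound.ogg_url and sound.mp3_url are now set; any key
#     # missing from the model would trigger wxr.wtp.warning() instead.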


def split_senseids(senseids_str: str) -> list[str]:
    senseids = []
    raw_ids = (
        senseids_str.strip().removeprefix("[").removesuffix("]").split(",")
    )
    for raw_id in raw_ids:
        range_split = raw_id.split("-")
        if len(range_split) == 1:
            senseids.append(raw_id.strip())
        elif len(range_split) == 2:
            try:
                # expand a numeric range like "3-5", ignoring letter suffixes
                start = re.sub(r"[a-z]", "", range_split[0].strip())
                end = re.sub(r"[a-z]", "", range_split[1].strip())
                senseids.extend(
                    [
                        str(id)
                        for id in range(
                            int(start),
                            int(end) + 1,
                        )
                    ]
                )
            except Exception:
                pass

    return senseids
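
# Illustrative sketch (not part of the original module): single ids are kept
# as-is and numeric ranges are expanded.
#
#     >>> split_senseids("[1, 3-5]")
#     ['1', '3', '4', '5']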


def calculate_bold_offsets(
    wxr,
    node: WikiNode,
    node_text: str,
    example,
    field: str,
    extra_node_kind: NodeKind | None = None,
) -> None:
    from ..page import clean_node

    offsets = []
    bold_words = set()
    for b_tag in node.find_html_recursively("b"):
        bold_words.add(clean_node(wxr, None, b_tag))
    for strong_tag in node.find_html_recursively("strong"):
        bold_words.add(clean_node(wxr, None, strong_tag))
    for bold_node in node.find_child_recursively(
        NodeKind.BOLD
        if extra_node_kind is None
        else NodeKind.BOLD | extra_node_kind
    ):
        bold_words.add(clean_node(wxr, None, bold_node))
    for link_node in node.find_child_recursively(NodeKind.LINK):
        if len(link_node.largs) > 0:
            link_dest = clean_node(wxr, None, link_node.largs[0])
            if "#" in link_dest and not link_dest.startswith("#"):
                link_dest = link_dest[: link_dest.index("#")]
            if link_dest == wxr.wtp.title:
                # MediaWiki renders links to the current page in bold
                link_text = clean_node(wxr, None, link_node)
                bold_words.add(link_text)

    for bold_word in bold_words:
        for m in re.finditer(re.escape(bold_word), node_text):
            offsets.append((m.start(), m.end()))
    if len(offsets) > 0:
        if hasattr(example, field):  # pydantic model
            setattr(example, field, sorted(offsets))
        elif isinstance(example, dict):
            example[field] = sorted(offsets)
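
# Illustrative sketch (not part of the original module): for a parsed example
# sentence "I like '''apples'''." whose node_text is "I like apples.", the
# bold word "apples" spans characters 7-13, so the function would set
# example.bold_text_offsets (or example["bold_text_offsets"]) to [(7, 13)]
# when called with field="bold_text_offsets". That field name is an
# assumption taken from the callers, not enforced by this function.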