Coverage for src/wiktextract/extractor/share.py: 86%

103 statements  

coverage.py v7.14.0, created at 2026-05-11 04:48 +0000

  1  import hashlib
  2  import re
  3  from html import unescape
  4  from typing import Iterable, Optional, Union
  5
  6  from wikitextprocessor import NodeKind, WikiNode
  7
  8  from wiktextract.wxr_context import WiktextractContext
  9
 10  from ..page import clean_node
 11
 12

 13  def strip_nodes(
 14      nodes: list[Union[WikiNode, str]],
 15  ) -> Iterable[Union[WikiNode, str]]:
 16      # filter out nodes that contain only newlines, spaces and non-breaking spaces
 17      return filter(
 18          lambda node: isinstance(node, WikiNode)
 19          or (isinstance(node, str) and len(unescape(node).strip()) > 0),
 20          nodes,
 21      )
 22
 23
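# Usage sketch (illustrative; not part of the measured statements above):
# strip_nodes() keeps WikiNode children and any string with visible text;
# "&nbsp;" unescapes to U+00A0, which str.strip() treats as whitespace.
assert list(strip_nodes(["foo", "\n ", "&nbsp;", "bar"])) == ["foo", "bar"]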

 24  def capture_text_in_parentheses(text: str) -> tuple[list[str], str]:
 25      """
 26      Return a list of text found inside parentheses, and the rest of the text.
 27      """
 28      rest_parts = []
 29      capture_text_list = []
 30      last_group_end = 0
 31      for m in re.finditer(r"\([^()]+\)", text):
 32          not_captured = text[last_group_end : m.start()].strip()
 33          if len(not_captured) > 0:
 34              rest_parts.append(not_captured)
 35          last_group_end = m.end()
 36          group_text = m.group()[1:-1].strip()
 37          if len(group_text) > 0:
 38              capture_text_list.append(group_text)
 39      not_captured = text[last_group_end:].strip()
 40      if len(not_captured) > 0:
 41          rest_parts.append(not_captured)
 42      rest_text = " ".join(rest_parts) if len(rest_parts) > 0 else text
 43      return capture_text_list, rest_text
 44
 45
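# Usage sketch (illustrative; not part of the measured statements above):
# the regex matches only innermost, parenthesis-free "(...)" groups.
captured, rest = capture_text_in_parentheses("foo (bar) baz (qux)")
assert captured == ["bar", "qux"]
assert rest == "foo baz"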

 46  def split_chinese_variants(text: str) -> Iterable[tuple[Optional[str], str]]:
 47      """
 48      Yield (script code, text) tuples for a "Traditional/Simplified" pair.
 49      """
 50      if "/" in text:
 51          split_result = text.split("/")
 52          if len(split_result) != 2:
 53              yield None, text
 54          else:
 55              for variant_index, variant in enumerate(split_result):
 56                  yield "zh-Hant" if variant_index == 0 else "zh-Hans", variant
 57      else:
 58          yield None, text
 59
 60
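# Usage sketch (illustrative; not part of the measured statements above):
# exactly one "/" marks a Traditional/Simplified pair; anything else is
# passed through untagged.
assert list(split_chinese_variants("漢語/汉语")) == [
    ("zh-Hant", "漢語"),
    ("zh-Hans", "汉语"),
]
assert list(split_chinese_variants("中文")) == [(None, "中文")]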

 61  def create_audio_url_dict(filename: str) -> dict[str, str]:
 62      # remove whitespace and left-to-right marks
 63      filename = filename.strip(" \u200e")
 64      file_extension = filename[filename.rfind(".") + 1 :].lower()
 65      if file_extension == "ogv":
 66          # ".ogv" pages are redirected to ".oga" pages in Wikimedia Commons
 67          filename = filename[: filename.rfind(".")] + ".oga"
 68          file_extension = "oga"
 69      file_url_key = file_extension + "_url"
 70      filename_without_prefix = filename.removeprefix("File:")
 71      if len(filename_without_prefix) == 0:  [71 ↛ 72: condition never true]
 72          return {}
 73      audio_dict = {
 74          "audio": filename_without_prefix,
 75          file_url_key: "https://commons.wikimedia.org/wiki/Special:FilePath/"
 76          + filename_without_prefix,
 77      }
 78      transcode_formats = []
 79      if file_extension not in ("oga", "ogg"):
 80          transcode_formats.append("ogg")
 81      if file_extension != "mp3":  [81 ↛ 83: condition always true]
 82          transcode_formats.append("mp3")
 83      for file_suffix in transcode_formats:
 84          audio_dict[f"{file_suffix}_url"] = create_transcode_url(
 85              filename_without_prefix.replace(" ", "_"), file_suffix
 86          )
 87      return audio_dict
 88
 89
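# Usage sketch (illustrative; not part of the measured statements above):
# a ".wav" file gets a direct Special:FilePath URL plus transcoded
# "ogg"/"mp3" URLs; spaces become underscores only in the transcode path.
d = create_audio_url_dict("File:Example audio.wav")
assert d["audio"] == "Example audio.wav"
assert sorted(d) == ["audio", "mp3_url", "ogg_url", "wav_url"]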

 90  def create_transcode_url(filename: str, transcode_suffix: str) -> str:
 91      # Chinese Wiktionary templates might expand to a filename whose first
 92      # letter is lowercase, but the actual Wikimedia Commons file's first
 93      # letter is capitalized
 94      filename = filename[0].upper() + filename[1:]
 95      md5 = hashlib.md5(filename.encode()).hexdigest()
 96      return (
 97          "https://upload.wikimedia.org/wikipedia/commons/transcoded/"
 98          + f"{md5[0]}/{md5[:2]}/{filename}/{filename}.{transcode_suffix}"
 99      )
100
101
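# Usage sketch (illustrative; not part of the measured statements above):
# the path embeds the first one and first two hex digits of the filename's
# md5, mirroring Wikimedia's hashed upload directory layout.
url = create_transcode_url("Example_audio.wav", "mp3")
assert url.endswith("/Example_audio.wav/Example_audio.wav.mp3")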

102  def set_sound_file_url_fields(
103      wxr: WiktextractContext, filename, pydantic_model
104  ):
105      file_data = create_audio_url_dict(filename)
106      for key, value in file_data.items():
107          if hasattr(pydantic_model, key):  [107 ↛ 110: condition always true]
108              setattr(pydantic_model, key, value)
109          else:
110              wxr.wtp.warning(
111                  f"{key=} not defined in Sound",
112                  sortid="extractor.share.set_sound_file_url_fields",
113              )
114
115
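# Usage sketch (illustrative; not part of the measured statements above).
# DummySound is a hypothetical stand-in for the extractor's real Sound
# model; when every key maps to an attribute, the warning branch (and thus
# wxr) is never reached, so None is passed here purely for demonstration.
class DummySound:
    audio: str = ""
    wav_url: str = ""
    ogg_url: str = ""
    mp3_url: str = ""

sound = DummySound()
set_sound_file_url_fields(None, "File:Example.wav", sound)
assert sound.audio == "Example.wav"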

116  def split_senseids(senseids_str: str) -> list[str]:
117      senseids = []
118      raw_ids = (
119          senseids_str.strip().removeprefix("[").removesuffix("]").split(",")
120      )
121      for raw_id in raw_ids:
122          range_split = raw_id.split("-")
123          if len(range_split) == 1:
124              senseids.append(raw_id.strip())
125          elif len(range_split) == 2:  [125 ↛ 121: condition always true]
126              try:
127                  start = re.sub(r"[a-z]", "", range_split[0].strip())
128                  end = re.sub(r"[a-z]", "", range_split[1].strip())
129                  senseids.extend(
130                      [
131                          str(id)
132                          for id in range(
133                              int(start),
134                              int(end) + 1,
135                          )
136                      ]
137                  )
138              except Exception:
139                  pass
140
141      return senseids
142
143
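# Usage sketch (illustrative; not part of the measured statements above):
# numeric ranges are expanded, letter suffixes inside a range are dropped,
# and standalone ids are kept verbatim.
assert split_senseids("[1-3, 5a]") == ["1", "2", "3", "5a"]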

144  def calculate_bold_offsets(
145      wxr: WiktextractContext,
146      node: WikiNode,
147      node_text: str,
148      example,
149      field: str,
150      extra_node_kind: NodeKind | None = None,
151  ) -> None:
152      offsets = []
153      bold_words = set()
154      for b_tag in node.find_html_recursively("b"):
155          bold_words.add(clean_node(wxr, None, b_tag))
156      for strong_tag in node.find_html_recursively("strong"):
157          bold_words.add(clean_node(wxr, None, strong_tag))
158      for bold_node in node.find_child_recursively(
159          NodeKind.BOLD
160          if extra_node_kind is None
161          else NodeKind.BOLD | extra_node_kind
162      ):
163          bold_words.add(clean_node(wxr, None, bold_node))
164      for link_node in node.find_child_recursively(NodeKind.LINK):
165          if len(link_node.largs) > 0:  [165 ↛ 164: condition always true]
166              link_dest = clean_node(wxr, None, link_node.largs[0])
167              if "#" in link_dest and not link_dest.startswith("#"):
168                  link_dest = link_dest[: link_dest.index("#")]
169              if link_dest == wxr.wtp.title:
170                  link_text = clean_node(wxr, None, link_node)
171                  bold_words.add(link_text)
172
173      for bold_word in bold_words:
174          for m in re.finditer(re.escape(bold_word), node_text):
175              offsets.append((m.start(), m.end()))
176      if len(offsets) > 0:
177          if hasattr(example, field):  # pydantic model
178              setattr(example, field, sorted(offsets))
179          elif isinstance(example, dict):  [179 ↛ exit: condition always true]
180              example[field] = sorted(offsets)
181
182
183
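# Usage sketch (illustrative; not part of the measured statements above).
# Assumes the test-style setup used elsewhere in wiktextract; the exact
# constructor arguments may differ between versions.
from wikitextprocessor import Wtp
from wiktextract.config import WiktextractConfig

wxr = WiktextractContext(Wtp(), WiktextractConfig())
wxr.wtp.start_page("foo")
root = wxr.wtp.parse("foo '''bar''' baz")
example_dict: dict = {}
calculate_bold_offsets(wxr, root, "foo bar baz", example_dict, "bold_offsets")
assert example_dict["bold_offsets"] == [(4, 7)]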