Coverage for src/wiktextract/extractor/share.py: 78%
73 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import hashlib
2import re
3from html import unescape
4from typing import Iterable, Optional, Union
6from wikitextprocessor import WikiNode
def strip_nodes(
    nodes: list[Union[WikiNode, str]]
) -> Iterable[Union[WikiNode, str]]:
    """Yield nodes that carry content.

    Every ``WikiNode`` is kept; a ``str`` is kept only if, after
    HTML-unescaping, it still contains something other than whitespace
    (newlines, spaces and non-breaking spaces count as empty).
    """
    for node in nodes:
        if isinstance(node, WikiNode):
            yield node
        elif isinstance(node, str) and len(unescape(node).strip()) > 0:
            yield node
def capture_text_in_parentheses(text: str) -> tuple[list[str], str]:
    """Split *text* into parenthesized parts and the remaining text.

    Returns a tuple ``(captured, rest)`` where ``captured`` is the list
    of non-empty strings found inside ``(...)`` groups and ``rest`` is
    the text outside all groups, joined with single spaces.  When the
    input contains no parentheses it is returned unchanged as ``rest``.
    """
    rest_parts = []
    capture_text_list = []
    last_group_end = 0
    for m in re.finditer(r"\([^()]+\)", text):
        # text between the previous group (or the start) and this group
        not_captured = text[last_group_end : m.start()].strip()
        if len(not_captured) > 0:
            rest_parts.append(not_captured)
        last_group_end = m.end()
        # use a fresh name: the old code clobbered `text` here, which
        # corrupted the fallback below
        captured = m.group()[1:-1].strip()
        if len(captured) > 0:
            capture_text_list.append(captured)
    if last_group_end == 0:
        # no parentheses found: the whole input is the rest text
        rest_text = text
    else:
        # bug fix: keep any text after the last group; previously it was
        # dropped and the fallback returned the last *captured* text
        tail = text[last_group_end:].strip()
        if len(tail) > 0:
            rest_parts.append(tail)
        rest_text = " ".join(rest_parts)
    return capture_text_list, rest_text
def split_chinese_variants(text: str) -> Iterable[tuple[Optional[str], str]]:
    """Yield (variant language code, text) pairs.

    A "traditional/simplified" string yields ``("zh-Hant", ...)`` then
    ``("zh-Hans", ...)``; any other shape (no slash, or more than one)
    yields a single ``(None, text)`` pair.
    """
    parts = text.split("/")
    if len(parts) == 2:
        yield "zh-Hant", parts[0]
        yield "zh-Hans", parts[1]
    else:
        yield None, text
def create_audio_url_dict(filename: str) -> dict[str, str]:
    """Build the audio-file dict (name plus Commons URLs) for *filename*.

    Returns an empty dict when nothing remains after removing the
    "File:" prefix.
    """
    # remove surrounding white space and the left-to-right mark
    filename = filename.strip(" \u200e")
    dot = filename.rfind(".")
    file_extension = filename[dot + 1 :].lower()
    if file_extension == "ogv":
        # ".ogv" pages are redirected to ".oga" pages in Wikipedia Commons
        filename = filename[:dot] + ".oga"
        file_extension = "oga"
    filename_without_prefix = filename.removeprefix("File:")
    if len(filename_without_prefix) == 0:
        return {}
    audio_dict = {
        "audio": filename_without_prefix,
        file_extension
        + "_url": "https://commons.wikimedia.org/wiki/Special:FilePath/"
        + filename_without_prefix,
    }
    # transcode to "ogg" and/or "mp3" unless the file already is one
    transcode_suffixes = [
        suffix
        for suffix, already in (("ogg", ("oga", "ogg")), ("mp3", ("mp3",)))
        if file_extension not in already
    ]
    for suffix in transcode_suffixes:
        audio_dict[f"{suffix}_url"] = create_transcode_url(
            filename_without_prefix.replace(" ", "_"), suffix
        )
    return audio_dict
def create_transcode_url(filename: str, transcode_suffix: str) -> str:
    """Return the Wikimedia Commons transcoded-file URL for *filename*.

    Commons serves transcodes from
    ``.../transcoded/<md5[0]>/<md5[:2]>/<filename>/<filename>.<suffix>``,
    where the md5 hash is that of the (capitalized) file name.
    """
    # Chinese Wiktionary templates might expand a filename that has a lower
    # first letter but the actual Wikimedia Commons file's first letter is
    # capitalized
    filename = filename[0].upper() + filename[1:]
    md5 = hashlib.md5(filename.encode()).hexdigest()
    # bug fix: the path segments previously contained a literal
    # "(unknown)" placeholder instead of the file name
    return (
        "https://upload.wikimedia.org/wikipedia/commons/transcoded/"
        + f"{md5[0]}/{md5[:2]}/{filename}/{filename}.{transcode_suffix}"
    )
def set_sound_file_url_fields(wxr, filename, pydantic_model):
    """Copy audio URL fields for *filename* onto *pydantic_model*.

    Any key the model does not define is reported through the wikitext
    processor's warning channel instead of being set.
    """
    for key, value in create_audio_url_dict(filename).items():
        if not hasattr(pydantic_model, key):
            wxr.wtp.warning(
                f"{key=} not defined in Sound",
                sortid="extractor.share.set_sound_file_url_fields",
            )
        else:
            setattr(pydantic_model, key, value)
def split_senseids(senseids_str: str) -> list[str]:
    """Parse a sense-id string such as ``"[1, 3-5]"`` into a list of ids.

    Single ids are kept as stripped strings; ``start-end`` ranges are
    expanded to every integer id in the range (lower-case letter
    suffixes such as ``"3a"`` are dropped before expansion).  Malformed
    or non-numeric ranges are silently ignored.
    """
    senseids: list[str] = []
    raw_ids = (
        senseids_str.strip().removeprefix("[").removesuffix("]").split(",")
    )
    for raw_id in raw_ids:
        range_split = raw_id.split("-")
        if len(range_split) == 1:
            senseids.append(raw_id.strip())
        elif len(range_split) == 2:
            try:
                # drop letter suffixes, e.g. "3a-5b" -> 3..5
                start = re.sub(r"[a-z]", "", range_split[0].strip())
                end = re.sub(r"[a-z]", "", range_split[1].strip())
                senseids.extend(
                    str(id) for id in range(int(start), int(end) + 1)
                )
            except ValueError:
                # narrowed from a bare `except:`; only int() can raise
                # here, on a non-numeric range boundary — skip that id
                pass
    return senseids