Coverage for src/wiktextract/extractor/share.py: 84%
102 statements
coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
import hashlib
import re
from html import unescape
from typing import Iterable, Optional, Union

from wikitextprocessor import NodeKind, WikiNode

def strip_nodes(
    nodes: list[Union[WikiNode, str]],
) -> Iterable[Union[WikiNode, str]]:
    # Filter out string nodes that contain only newlines, whitespace and
    # non-breaking spaces; WikiNode nodes always pass through.
    return filter(
        lambda node: isinstance(node, WikiNode)
        or (isinstance(node, str) and len(unescape(node).strip()) > 0),
        nodes,
    )
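
# A minimal usage sketch (illustrative, not part of the module): plain-string
# nodes that are only whitespace or "&nbsp;" entities are dropped, while
# WikiNode objects and non-empty strings pass through unchanged.
# >>> list(strip_nodes(["\n", "&nbsp;", "word"]))
# ['word']
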
def capture_text_in_parentheses(text: str) -> tuple[list[str], str]:
    """
    Return a list of the texts inside parentheses and the remaining text.
    """
    rest_parts = []
    capture_text_list = []
    last_group_end = 0
    for m in re.finditer(r"\([^()]+\)", text):
        not_captured = text[last_group_end : m.start()].strip()
        if len(not_captured) > 0:  # coverage: condition never true in tests
            rest_parts.append(not_captured)
        last_group_end = m.end()
        group_text = m.group()[1:-1].strip()
        if len(group_text) > 0:
            capture_text_list.append(group_text)
    not_captured = text[last_group_end:].strip()
    if len(not_captured) > 0:
        rest_parts.append(not_captured)
    rest_text = " ".join(rest_parts) if len(rest_parts) > 0 else text
    return capture_text_list, rest_text
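
# For example (illustrative doctest, not in the source):
# >>> capture_text_in_parentheses("foo (bar) baz (qux)")
# (['bar', 'qux'], 'foo baz')
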
def split_chinese_variants(text: str) -> Iterable[tuple[Optional[str], str]]:
    """
    Yield (language code, text) tuples; text of the form
    "traditional/simplified" is split into "zh-Hant" and "zh-Hans" variants.
    """
    if "/" in text:
        split_result = text.split("/")
        if len(split_result) != 2:
            yield None, text
        else:
            for variant_index, variant in enumerate(split_result):
                yield "zh-Hant" if variant_index == 0 else "zh-Hans", variant
    else:
        yield None, text
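
# Hedged illustration of the generator's output:
# >>> list(split_chinese_variants("漢語/汉语"))
# [('zh-Hant', '漢語'), ('zh-Hans', '汉语')]
# >>> list(split_chinese_variants("word"))
# [(None, 'word')]
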
def create_audio_url_dict(filename: str) -> dict[str, str]:
    # remove whitespace and the left-to-right mark
    filename = filename.strip(" \u200e")
    file_extension = filename[filename.rfind(".") + 1 :].lower()
    if file_extension == "ogv":
        # ".ogv" pages are redirected to ".oga" pages on Wikimedia Commons
        filename = filename[: filename.rfind(".")] + ".oga"
        file_extension = "oga"
    file_url_key = file_extension + "_url"
    filename_without_prefix = filename.removeprefix("File:")
    if len(filename_without_prefix) == 0:  # coverage: condition never true in tests
        return {}
    audio_dict = {
        "audio": filename_without_prefix,
        file_url_key: "https://commons.wikimedia.org/wiki/Special:FilePath/"
        + filename_without_prefix,
    }
    transcode_formats = []
    if file_extension not in ("oga", "ogg"):
        transcode_formats.append("ogg")
    if file_extension != "mp3":  # coverage: condition always true in tests
        transcode_formats.append("mp3")
    for file_suffix in transcode_formats:
        audio_dict[f"{file_suffix}_url"] = create_transcode_url(
            filename_without_prefix.replace(" ", "_"), file_suffix
        )
    return audio_dict
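
# A minimal sketch of the resulting keys (the filename is hypothetical); the
# "ogg_url" and "mp3_url" transcode URLs are added because ".wav" is neither
# an Ogg format nor MP3:
# >>> sorted(create_audio_url_dict("File:Example.wav").keys())
# ['audio', 'mp3_url', 'ogg_url', 'wav_url']
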
def create_transcode_url(filename: str, transcode_suffix: str) -> str:
    # The Chinese Wiktionary template may expand to a filename whose first
    # letter is lowercase, while the actual Wikimedia Commons filename starts
    # with a capital letter.
    filename = filename[0].upper() + filename[1:]
    md5 = hashlib.md5(filename.encode()).hexdigest()
    return (
        "https://upload.wikimedia.org/wikipedia/commons/transcoded/"
        + f"{md5[0]}/{md5[:2]}/{filename}/{filename}.{transcode_suffix}"
    )
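
# For instance, create_transcode_url("Example.wav", "mp3") (hypothetical
# filename) returns a URL of the form
# ".../transcoded/<h>/<hh>/Example.wav/Example.wav.mp3", where <hh> are the
# first two hex digits of the MD5 of the capitalized filename and <h> is the
# first of them.
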
def set_sound_file_url_fields(wxr, filename, pydantic_model):
    file_data = create_audio_url_dict(filename)
    for key, value in file_data.items():
        if hasattr(pydantic_model, key):  # coverage: condition always true in tests
            setattr(pydantic_model, key, value)
        else:
            wxr.wtp.warning(
                f"{key=} not defined in Sound",
                sortid="extractor.share.set_sound_file_url_fields",
            )
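
# A hedged sketch, assuming a pydantic Sound model with matching field names
# and an extractor context wxr (both hypothetical here, not the project's
# actual classes):
# >>> class Sound(BaseModel):
# ...     audio: str = ""
# ...     wav_url: str = ""
# ...     ogg_url: str = ""
# ...     mp3_url: str = ""
# >>> sound = Sound()
# >>> set_sound_file_url_fields(wxr, "File:Example.wav", sound)
# >>> sound.wav_url
# 'https://commons.wikimedia.org/wiki/Special:FilePath/Example.wav'
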
def split_senseids(senseids_str: str) -> list[str]:
    senseids = []
    raw_ids = (
        senseids_str.strip().removeprefix("[").removesuffix("]").split(",")
    )
    for raw_id in raw_ids:
        range_split = raw_id.split("-")
        if len(range_split) == 1:
            senseids.append(raw_id.strip())
        elif len(range_split) == 2:  # coverage: condition always true in tests
            try:
                start = re.sub(r"[a-z]", "", range_split[0].strip())
                end = re.sub(r"[a-z]", "", range_split[1].strip())
                senseids.extend(
                    [
                        str(id)
                        for id in range(
                            int(start),
                            int(end) + 1,
                        )
                    ]
                )
            except Exception:
                pass

    return senseids
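
# Illustrative behavior: single ids are kept as-is, "start-end" ranges are
# expanded, and lowercase letters (e.g. "3a") are stripped before the range
# endpoints are parsed:
# >>> split_senseids("[1, 3-5]")
# ['1', '3', '4', '5']
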
def calculate_bold_offsets(
    wxr,
    node: WikiNode,
    node_text: str,
    example,
    field: str,
    extra_node_kind: NodeKind | None = None,
) -> None:
    from ..page import clean_node

    offsets = []
    bold_words = set()
    for b_tag in node.find_html_recursively("b"):
        bold_words.add(clean_node(wxr, None, b_tag))
    for strong_tag in node.find_html_recursively("strong"):
        bold_words.add(clean_node(wxr, None, strong_tag))
    for bold_node in node.find_child_recursively(
        NodeKind.BOLD
        if extra_node_kind is None
        else NodeKind.BOLD | extra_node_kind
    ):
        bold_words.add(clean_node(wxr, None, bold_node))
    for link_node in node.find_child_recursively(NodeKind.LINK):
        if len(link_node.largs) > 0:  # coverage: condition always true in tests
            link_dest = clean_node(wxr, None, link_node.largs[0])
            if "#" in link_dest and not link_dest.startswith("#"):
                link_dest = link_dest[: link_dest.index("#")]
            if link_dest == wxr.wtp.title:
                link_text = clean_node(wxr, None, link_node)
                bold_words.add(link_text)

    for bold_word in bold_words:
        for m in re.finditer(re.escape(bold_word), node_text):
            offsets.append((m.start(), m.end()))
    if len(offsets) > 0:
        if hasattr(example, field):  # pydantic model
            setattr(example, field, sorted(offsets))
        elif isinstance(example, dict):  # coverage: condition always true in tests
            example[field] = sorted(offsets)
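
# Hedged illustration: if node_text is "I like apples" and "apples" was bold
# in the wikitext (or is a self-link to the page title), calling
# calculate_bold_offsets(wxr, node, "I like apples", example,
# "bold_text_offsets") stores [(7, 13)] on example.bold_text_offsets (or
# under that dict key); the field name here is hypothetical.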