Coverage for src/wiktextract/extractor/share.py: 86%
103 statements
coverage.py v7.14.0, created at 2026-05-11 04:48 +0000
import hashlib
import re
from html import unescape
from typing import Iterable, Optional, Union

from wikitextprocessor import NodeKind, WikiNode

from wiktextract.wxr_context import WiktextractContext

from ..page import clean_node

def strip_nodes(
    nodes: list[Union[WikiNode, str]],
) -> Iterable[Union[WikiNode, str]]:
    # Filter out string nodes that contain only newlines, whitespace, and
    # non-breaking spaces; keep all WikiNode objects.
    return filter(
        lambda node: isinstance(node, WikiNode)
        or (isinstance(node, str) and len(unescape(node).strip()) > 0),
        nodes,
    )
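
# Illustrative usage (hypothetical input): whitespace-only strings are
# dropped while WikiNode objects and non-empty strings pass through.
#     list(strip_nodes(["\n", "  ", "word", some_wiki_node]))
#     # -> ["word", some_wiki_node]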

def capture_text_in_parentheses(text: str) -> tuple[list[str], str]:
    """
    Return a list of the texts found inside parentheses, plus the rest of
    the text.
    """
    rest_parts = []
    capture_text_list = []
    last_group_end = 0
    for m in re.finditer(r"\([^()]+\)", text):
        not_captured = text[last_group_end : m.start()].strip()
        if len(not_captured) > 0:
            rest_parts.append(not_captured)
        last_group_end = m.end()
        group_text = m.group()[1:-1].strip()
        if len(group_text) > 0:
            capture_text_list.append(group_text)
    not_captured = text[last_group_end:].strip()
    if len(not_captured) > 0:
        rest_parts.append(not_captured)
    rest_text = " ".join(rest_parts) if len(rest_parts) > 0 else text
    return capture_text_list, rest_text
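
# Illustrative example (hypothetical input): parenthesized chunks are
# captured and the remainder is re-joined with single spaces.
#     capture_text_in_parentheses("past tense (informal) of go (verb)")
#     # -> (["informal", "verb"], "past tense of go")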

def split_chinese_variants(text: str) -> Iterable[tuple[Optional[str], str]]:
    """
    Yield (language code, text) pairs for Chinese variants separated by "/":
    the first part is traditional ("zh-Hant"), the second simplified
    ("zh-Hans").
    """
    if "/" in text:
        split_result = text.split("/")
        if len(split_result) != 2:
            yield None, text
        else:
            for variant_index, variant in enumerate(split_result):
                yield "zh-Hant" if variant_index == 0 else "zh-Hans", variant
    else:
        yield None, text
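
# Illustrative example (hypothetical input): "貓/猫" splits into the
# traditional and simplified forms; text without "/" passes through untagged.
#     list(split_chinese_variants("貓/猫"))
#     # -> [("zh-Hant", "貓"), ("zh-Hans", "猫")]
#     list(split_chinese_variants("貓"))
#     # -> [(None, "貓")]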

def create_audio_url_dict(filename: str) -> dict[str, str]:
    # remove whitespace and the left-to-right mark
    filename = filename.strip(" \u200e")
    file_extension = filename[filename.rfind(".") + 1 :].lower()
    if file_extension == "ogv":
        # ".ogv" pages are redirected to ".oga" pages on Wikimedia Commons
        filename = filename[: filename.rfind(".")] + ".oga"
        file_extension = "oga"
    file_url_key = file_extension + "_url"
    filename_without_prefix = filename.removeprefix("File:")
    if len(filename_without_prefix) == 0:
        return {}
    audio_dict = {
        "audio": filename_without_prefix,
        file_url_key: "https://commons.wikimedia.org/wiki/Special:FilePath/"
        + filename_without_prefix,
    }
    transcode_formats = []
    if file_extension not in ("oga", "ogg"):
        transcode_formats.append("ogg")
    if file_extension != "mp3":
        transcode_formats.append("mp3")
    for file_suffix in transcode_formats:
        audio_dict[f"{file_suffix}_url"] = create_transcode_url(
            filename_without_prefix.replace(" ", "_"), file_suffix
        )
    return audio_dict
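
# Illustrative example (hypothetical filename): the original extension gets a
# direct Special:FilePath URL, plus transcode URLs for the missing formats.
#     create_audio_url_dict("File:en-us-example.ogg")
#     # -> {"audio": "en-us-example.ogg",
#     #     "ogg_url": "https://commons.wikimedia.org/wiki/Special:FilePath/en-us-example.ogg",
#     #     "mp3_url": <transcode URL built by create_transcode_url>}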

def create_transcode_url(filename: str, transcode_suffix: str) -> str:
    # A Chinese Wiktionary template might expand to a filename whose first
    # letter is lower case, but the actual Wikimedia Commons filename starts
    # with a capital letter.
    filename = filename[0].upper() + filename[1:]
    md5 = hashlib.md5(filename.encode()).hexdigest()
    return (
        "https://upload.wikimedia.org/wikipedia/commons/transcoded/"
        + f"{md5[0]}/{md5[:2]}/{filename}/{filename}.{transcode_suffix}"
    )
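
# Illustrative example (hypothetical filename): the path segments come from
# the MD5 hex digest of the capitalized filename.
#     create_transcode_url("En-us-example.ogg", "mp3")
#     # -> "https://upload.wikimedia.org/wikipedia/commons/transcoded/"
#     #    "<d>/<dd>/En-us-example.ogg/En-us-example.ogg.mp3"
#     # where <d> and <dd> are the first one and two hex digits of the digest.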

def set_sound_file_url_fields(
    wxr: WiktextractContext, filename, pydantic_model
):
    file_data = create_audio_url_dict(filename)
    for key, value in file_data.items():
        if hasattr(pydantic_model, key):
            setattr(pydantic_model, key, value)
        else:
            wxr.wtp.warning(
                f"{key=} not defined in Sound",
                sortid="extractor.share.set_sound_file_url_fields",
            )
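
# Illustrative sketch (assumes a "Sound" pydantic model with "audio",
# "ogg_url", and "mp3_url" fields, as hinted by the warning message):
#     sound = Sound()
#     set_sound_file_url_fields(wxr, "File:en-us-example.ogg", sound)
#     # sound.audio, sound.ogg_url, and sound.mp3_url are now populated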

def split_senseids(senseids_str: str) -> list[str]:
    senseids = []
    raw_ids = (
        senseids_str.strip().removeprefix("[").removesuffix("]").split(",")
    )
    for raw_id in raw_ids:
        range_split = raw_id.split("-")
        if len(range_split) == 1:
            senseids.append(raw_id.strip())
        elif len(range_split) == 2:
            # Expand a numeric range such as "3-5" to "3", "4", "5";
            # strip letter suffixes from the endpoints first.
            try:
                start = re.sub(r"[a-z]", "", range_split[0].strip())
                end = re.sub(r"[a-z]", "", range_split[1].strip())
                senseids.extend(
                    [str(id) for id in range(int(start), int(end) + 1)]
                )
            except Exception:
                pass

    return senseids
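
# Illustrative examples (hypothetical inputs):
#     split_senseids("[1, 3-5]")
#     # -> ["1", "3", "4", "5"]
#     split_senseids("2a")
#     # -> ["2a"]   (no "-", so the id is kept as-is, letters included)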

def calculate_bold_offsets(
    wxr: WiktextractContext,
    node: WikiNode,
    node_text: str,
    example,
    field: str,
    extra_node_kind: NodeKind | None = None,
) -> None:
    """
    Find the start and end offsets of bold text within `node_text` and save
    them to the given field of `example` (a pydantic model or a dict).
    """
    offsets = []
    bold_words = set()
    for b_tag in node.find_html_recursively("b"):
        bold_words.add(clean_node(wxr, None, b_tag))
    for strong_tag in node.find_html_recursively("strong"):
        bold_words.add(clean_node(wxr, None, strong_tag))
    for bold_node in node.find_child_recursively(
        NodeKind.BOLD
        if extra_node_kind is None
        else NodeKind.BOLD | extra_node_kind
    ):
        bold_words.add(clean_node(wxr, None, bold_node))
    # A wiki link pointing at the current page title is rendered in bold.
    for link_node in node.find_child_recursively(NodeKind.LINK):
        if len(link_node.largs) > 0:
            link_dest = clean_node(wxr, None, link_node.largs[0])
            if "#" in link_dest and not link_dest.startswith("#"):
                link_dest = link_dest[: link_dest.index("#")]
            if link_dest == wxr.wtp.title:
                link_text = clean_node(wxr, None, link_node)
                bold_words.add(link_text)

    for bold_word in bold_words:
        for m in re.finditer(re.escape(bold_word), node_text):
            offsets.append((m.start(), m.end()))
    if len(offsets) > 0:
        if hasattr(example, field):  # pydantic model
            setattr(example, field, sorted(offsets))
        elif isinstance(example, dict):
            example[field] = sorted(offsets)
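
# Illustrative sketch (hypothetical node and dict example): if "bold" is
# marked up as bold in the parsed wikitext of node_text "a bold word",
# the character offsets are stored on the example:
#     example = {}
#     calculate_bold_offsets(wxr, node, "a bold word", example, "bold_offsets")
#     # example["bold_offsets"] == [(2, 6)]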