Coverage for src/wiktextract/extractor/share.py: 78%

73 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import hashlib 

2import re 

3from html import unescape 

4from typing import Iterable, Optional, Union 

5 

6from wikitextprocessor import WikiNode 

7 

8 

def strip_nodes(
    nodes: list[Union[WikiNode, str]],
) -> Iterable[Union[WikiNode, str]]:
    """
    Lazily filter out string nodes that carry no visible text.

    Every `WikiNode` is kept. A string is kept only when something is left
    after decoding its HTML entities and stripping whitespace, so strings
    made of newlines, spaces or non-breaking spaces are dropped. Items of
    any other type are dropped as well.
    """

    def _is_kept(candidate: Union[WikiNode, str]) -> bool:
        if isinstance(candidate, WikiNode):
            return True
        if not isinstance(candidate, str):
            return False
        # unescape() turns entities such as "&nbsp;" into real characters
        # so that strip() can remove them.
        return unescape(candidate).strip() != ""

    return filter(_is_kept, nodes)

18 

19 

def capture_text_in_parentheses(text: str) -> tuple[list[str], str]:
    """
    Return a list of text inside parentheses, and the rest text.

    Only innermost, non-empty `(...)` groups are matched. Each captured
    text is stripped, and captures that are empty after stripping are not
    added to the list. The rest text is made of the stripped pieces found
    before, between and after the groups, joined with single spaces.

    When nothing lies outside the parentheses, the rest text falls back to
    the last captured text (kept from the previous implementation). When
    `text` has no parenthesised group at all, `text` is returned unchanged
    as the rest text.
    """
    rest_parts = []
    capture_text_list = []
    last_group_end = 0
    # Used as the rest text when no text is found outside the groups.
    fallback_text = text
    for m in re.finditer(r"\([^()]+\)", text):
        # Always slice the original input: match offsets refer to it.
        not_captured = text[last_group_end : m.start()].strip()
        if not_captured:
            rest_parts.append(not_captured)
        last_group_end = m.end()
        fallback_text = m.group()[1:-1].strip()
        if fallback_text:
            capture_text_list.append(fallback_text)

    # A match always ends past offset 0, so this is true only if a group
    # was found; collect whatever follows the last group.
    if last_group_end > 0:
        trailing = text[last_group_end:].strip()
        if trailing:
            rest_parts.append(trailing)

    rest_text = " ".join(rest_parts) if rest_parts else fallback_text
    return capture_text_list, rest_text

38 

39 

def split_chinese_variants(text: str) -> Iterable[tuple[Optional[str], str]]:
    """
    Yield `(language code, text)` pairs for a Chinese term.

    A term with exactly one "/" is split in two: the part before the slash
    is yielded with "zh-Hant" and the part after it with "zh-Hans". Any
    other input (no slash, or more than one) is yielded unchanged with
    `None` as the code.
    """
    pieces = text.split("/")
    if len(pieces) != 2:
        yield None, text
        return

    first_variant, second_variant = pieces
    yield "zh-Hant", first_variant
    yield "zh-Hans", second_variant

53 

54 

def create_audio_url_dict(filename: str) -> dict[str, str]:
    """
    Build the audio fields for a sound file name.

    The result maps "audio" to the file name without its "File:" prefix,
    "<extension>_url" to a `Special:FilePath` URL for that name, and
    "ogg_url" / "mp3_url" to transcode URLs for the formats the file is not
    already in. An empty dict is returned when nothing is left of the name
    after the "File:" prefix is removed.
    """
    # remove white space and left-to-right mark
    filename = filename.strip(" \u200e")
    dot_index = filename.rfind(".")
    file_extension = filename[dot_index + 1 :].lower()
    if file_extension == "ogv":
        # ".ogv" pages are redirected to ".oga" pages in Wikimedia Commons
        file_extension = "oga"
        filename = filename[:dot_index] + ".oga"

    bare_name = filename.removeprefix("File:")
    if not bare_name:
        return {}

    audio_dict = {"audio": bare_name}
    audio_dict[file_extension + "_url"] = (
        "https://commons.wikimedia.org/wiki/Special:FilePath/" + bare_name
    )

    # Each transcode target is paired with the source extensions that make
    # it unnecessary.
    transcode_targets = (
        ("ogg", ("oga", "ogg")),
        ("mp3", ("mp3",)),
    )
    underscored_name = bare_name.replace(" ", "_")
    for file_suffix, native_extensions in transcode_targets:
        if file_extension in native_extensions:
            continue
        audio_dict[f"{file_suffix}_url"] = create_transcode_url(
            underscored_name, file_suffix
        )
    return audio_dict

82 

83 

def create_transcode_url(filename: str, transcode_suffix: str) -> str:
    """
    Return the upload.wikimedia.org URL of a transcoded Commons file.

    The path is built from the MD5 hex digest of the file name: its first
    character, then its first two characters, then the file name, then the
    file name again with `transcode_suffix` appended as the extension.

    `filename` must be non-empty (an empty string raises `IndexError`).
    The caller in this module passes it with spaces already replaced by
    underscores.
    """
    # Chinese Wiktionary template might expand a filename that has a lower
    # first letter but the actual Wikimedia Commons file's first letter is
    # capitalized
    filename = filename[0].upper() + filename[1:]
    # The digest only selects a directory, it is not a security measure;
    # saying so keeps this working on FIPS-restricted OpenSSL builds.
    md5 = hashlib.md5(filename.encode(), usedforsecurity=False).hexdigest()
    return (
        "https://upload.wikimedia.org/wikipedia/commons/transcoded/"
        + f"{md5[0]}/{md5[:2]}/{filename}/{filename}.{transcode_suffix}"
    )

94 

95 

def set_sound_file_url_fields(wxr, filename, pydantic_model):
    """
    Copy the audio fields derived from `filename` onto `pydantic_model`.

    Each key returned by `create_audio_url_dict` is set as an attribute when
    the model already has an attribute of that name. Otherwise a warning is
    reported through `wxr.wtp` and the value is skipped.
    """
    for key, value in create_audio_url_dict(filename).items():
        if not hasattr(pydantic_model, key):
            wxr.wtp.warning(
                f"{key=} not defined in Sound",
                sortid="extractor.share.set_sound_file_url_fields",
            )
            continue
        setattr(pydantic_model, key, value)

106 

107 

def split_senseids(senseids_str: str) -> list[str]:
    """
    Expand a sense id list such as "[1, 3-5]" into individual id strings.

    Surrounding whitespace, one leading "[" and one trailing "]" are removed
    and the remainder is split on commas. An item without "-" is stripped
    and kept as is. An item of the form "start-end" has lowercase ASCII
    letters removed from both bounds and is expanded to every integer from
    start to end inclusive. Ranges whose bounds are not integers, and items
    with more than one "-", are skipped.
    """
    senseids = []
    raw_ids = (
        senseids_str.strip().removeprefix("[").removesuffix("]").split(",")
    )
    for raw_id in raw_ids:
        range_split = raw_id.split("-")
        if len(range_split) == 1:
            senseids.append(raw_id.strip())
        elif len(range_split) == 2:
            try:
                start, end = (
                    int(re.sub(r"[a-z]", "", bound.strip()))
                    for bound in range_split
                )
            except ValueError:
                # Not a numeric range: ignore it rather than failing the
                # whole list. Only int() can fail here, so nothing broader
                # is caught.
                continue
            senseids.extend(str(number) for number in range(start, end + 1))

    return senseids