Coverage for src/wiktextract/extractor/de/example.py: 85%

80 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-17 08:19 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..share import calculate_bold_offsets 

8from .models import Example, Sense, WordEntry 

9from .tags import translate_raw_tags 

10from .utils import extract_sense_index 

11 

# Maps lower-cased argument names of the German "Literatur" citation template
# (https://de.wiktionary.org/wiki/Vorlage:Literatur) to the attribute name
# that receives the value on an example.  Several German names are aliases
# and share one target attribute.
LITERATUR_TEMPLATE_ARGS = {
    "autor": "author",
    "a": "author",  # alias of "autor"
    "titel": "title",
    "titelerg": "title_complement",
    "auflage": "edition",
    "verlag": "publisher",
    "ort": "place",
    "jahr": "year",
    "seiten": "pages",
    "isbn": "isbn",
    "übersetzer": "translator",
    "herausgeber": "editor",
    "sammelwerk": "collection",
    "werk": "collection",  # alias of "sammelwerk"
    "band": "volume",
    "kommentar": "comment",
    "online": "url",
    "tag": "day",
    "monat": "month",
    "zugriff": "accessdate",
    "nummer": "number",
    "datum": "date",
    "hrsg": "editor",  # alias of "herausgeber"
}

37 

38 

def extract_examples(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect the examples listed under ``level_node`` into ``word_entry``.

    Every list item found recursively below ``level_node`` is visited:

    * An item whose list marker (``sarg``) does not end with ``:`` replaces
      the current raw tag with its cleaned text (``:`` and spaces stripped);
      that tag is passed to the ``Example`` objects created afterwards.
    * Any other item is an example line.  Its ``<ref>`` children are handed
      to ``extract_reference``; the children returned by
      ``invert_find_child(NodeKind.LIST, ...)`` are re-parsed, cleaned, and
      split by ``extract_sense_index`` into a sense index and the text.

    A line with text and a sense index is appended to every sense accepted
    by ``match_sense_index``; when none matches, a new sense tagged
    ``no-gloss`` is created for it.  A line with text but no sense index is
    stored as the translation of the most recent indexed example, and a
    debug message is logged when there is no such example.  Lines without
    text are ignored.

    Returns nothing; ``word_entry.senses`` is modified in place.
    """
    last_example = None
    raw_tags = []
    for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        if not list_item_node.sarg.endswith(":"):
            # Not an example line: its text is the raw tag for what follows.
            raw_tags.clear()
            raw_tag = clean_node(wxr, None, list_item_node.children)
            raw_tag = raw_tag.strip(": ")
            if raw_tag != "":
                raw_tags.append(raw_tag)
            continue

        example_data = Example(raw_tags=raw_tags)
        for ref_tag in list_item_node.find_html("ref"):
            extract_reference(wxr, example_data, ref_tag)
        example_text_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                list(
                    list_item_node.invert_find_child(
                        NodeKind.LIST, include_empty_str=True
                    )
                )
            )
        )
        example_text = clean_node(wxr, None, example_text_node)
        sense_idx, example_text = extract_sense_index(example_text)
        if len(example_text) == 0:
            continue

        translate_raw_tags(example_data)
        example_data.text = example_text
        calculate_bold_offsets(
            wxr,
            example_text_node,
            example_text,
            example_data,
            "italic_text_offsets",
            extra_node_kind=NodeKind.ITALIC,
        )
        if len(sense_idx) > 0:
            find_sense = False
            for sense in word_entry.senses:
                if match_sense_index(sense_idx, sense):
                    sense.examples.append(example_data)
                    find_sense = True
            if not find_sense:
                new_sense = Sense(sense_index=sense_idx, tags=["no-gloss"])
                new_sense.examples.append(example_data)
                word_entry.senses.append(new_sense)
            last_example = example_data
        elif last_example is not None:
            # No sense index: this line is the translation of the previous
            # example, so the offsets must be recorded on that example too.
            # `example_data` is never attached to a sense in this branch.
            last_example.translation = example_text
            calculate_bold_offsets(
                wxr,
                example_text_node,
                example_text,
                last_example,
                "italic_translation_offsets",
                extra_node_kind=NodeKind.ITALIC,
            )
        else:
            wxr.wtp.debug(
                f"Found example data without senseid: {example_data}",
                sortid="extractor/de/examples/extract_examples/28",
            )

    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
        wxr.wtp.debug(
            f"Found unexpected non-list node in examples: {non_list_node}",
            sortid="extractor/de/examples/extract_examples/33",
        )

114 

115 

def extract_reference(
    wxr: WiktextractContext, example_data: Example, ref_node: WikiNode
):
    """Fill the reference fields of ``example_data`` from a ``<ref>`` node.

    The cleaned text of the node's children is stored in ``example_data.ref``.
    For each direct ``Literatur`` template child, every string-keyed
    parameter is looked up case-insensitively in ``LITERATUR_TEMPLATE_ARGS``;
    a known key sets the mapped attribute (when ``example_data`` has it) to
    the cleaned parameter value, an unknown key is reported through
    ``wxr.wtp.debug``.  Non-string keys are skipped silently.
    """
    example_data.ref = clean_node(wxr, None, ref_node.children)
    for template_node in ref_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name != "Literatur":
            # XXX: Treat other templates as well.
            # E.g. https://de.wiktionary.org/wiki/Vorlage:Ref-OWID
            continue

        # https://de.wiktionary.org/wiki/Vorlage:Literatur
        for key, value in template_node.template_parameters.items():
            if not isinstance(key, str):
                continue
            field = LITERATUR_TEMPLATE_ARGS.get(key.lower())
            if field is None:
                wxr.wtp.debug(
                    f"Unexpected key in Literatur template: {key}",
                    sortid="extractor/de/examples/extract_examples/77",
                )
            elif hasattr(example_data, field):
                setattr(example_data, field, clean_node(wxr, None, value))

140 

141 

def match_sense_index(sense_idx: str, sense: Sense) -> bool:
    """Return whether the example index ``sense_idx`` refers to ``sense``.

    ``sense_idx`` may be a single index (``"1"``, ``"2.1"``), a
    comma-separated list (``"1, 3"``), or contain numeric ranges written
    with a hyphen or an en dash (``"1-3"``, ``"1–3"``).

    * Equality with ``sense.sense_index`` always matches.
    * A single dotted index (contains ``.`` but neither ``,`` nor ``-``)
      matches by equality only.
    * Otherwise each comma-separated part matches either by equality or,
      when it is a ``N-M`` range, if the leading number of
      ``sense.sense_index`` (the text before the first ``,``, ``.``, ``-``
      or ``–``) lies within ``N..M`` inclusive.  A sense index that does not
      start with a plain decimal number never matches a range.
    """
    if sense_idx == sense.sense_index:
        return True
    if "." in sense_idx and "," not in sense_idx and "-" not in sense_idx:
        # Single sub-sense index such as "2.1": only equality counts.
        return False

    first_number_str = re.split(r",|\.|-|–", sense.sense_index, maxsplit=1)[0]
    # isdecimal(), not isdigit(): isdigit() also accepts characters such as
    # "²" that int() rejects with ValueError.
    if not first_number_str.isdecimal():
        return False
    first_number = int(first_number_str)

    for try_idx in sense_idx.split(","):
        try_idx = try_idx.strip()
        if try_idx == sense.sense_index:
            return True
        if re.fullmatch(r"\d+[\-–]\d+", try_idx):
            start_str, end_str = re.split(r"-|–", try_idx, maxsplit=1)
            if int(start_str) <= first_number <= int(end_str):
                return True

    return False

166 

167 return False