Coverage for src/wiktextract/extractor/de/example.py: 84%

77 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Example, Sense, WordEntry 

8from .tags import translate_raw_tags 

9from .utils import extract_sense_index 

10 

# Maps lower-cased parameter names of the German Wiktionary "Literatur"
# template (https://de.wiktionary.org/wiki/Vorlage:Literatur) to the field
# names used when storing the value on an example. Several template
# parameters are aliases and share one target field.
LITERATUR_TEMPLATE_ARGS: dict[str, str] = {
    "autor": "author",
    "a": "author",
    "titel": "title",
    "titelerg": "title_complement",
    "auflage": "edition",
    "verlag": "publisher",
    "ort": "place",
    "jahr": "year",
    "seiten": "pages",
    "isbn": "isbn",
    "übersetzer": "translator",
    "herausgeber": "editor",
    "sammelwerk": "collection",
    "werk": "collection",
    "band": "volume",
    "kommentar": "comment",
    "online": "url",
    "tag": "day",
    "monat": "month",
    "zugriff": "accessdate",
    "nummer": "number",
    "datum": "date",
    "hrsg": "editor",
}

36 

37 

def extract_examples(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
) -> None:
    """Extract the examples found under ``level_node`` into ``page_data``.

    Every list item below ``level_node`` is visited recursively:

    * An item whose ``sarg`` does not end with ``":"`` is treated as a
      label: its cleaned text (stripped of ``":"`` and spaces) replaces
      the raw tags given to the examples that follow.
    * Any other item is an example. ``<ref>`` children are parsed with
      ``extract_reference`` and the remaining text is split into a sense
      index and the example text by ``extract_sense_index``.

      - With a sense index, the example is appended to every matching
        sense of every entry; if none matches, a ``no-gloss`` sense is
        created on the last entry of ``page_data``.
      - Without a sense index, the text is stored as the translation of
        the most recent indexed example, if there is one.
      - Otherwise a debug message is emitted.

    Direct children of ``level_node`` that are not lists are reported
    through ``wxr.wtp.debug``.
    """
    last_example = None
    raw_tags = []
    for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        if not list_item_node.sarg.endswith(":"):
            # Label item: start a fresh set of raw tags.
            raw_tags.clear()
            raw_tag = clean_node(wxr, None, list_item_node.children)
            raw_tag = raw_tag.strip(": ")
            if raw_tag != "":
                raw_tags.append(raw_tag)
            continue

        example_data = Example(raw_tags=raw_tags)
        for ref_tag in list_item_node.find_html("ref"):
            extract_reference(wxr, example_data, ref_tag)
        # Nested lists are visited as their own items, so leave them out
        # of this item's text.
        example_text = clean_node(
            wxr, None, list(list_item_node.invert_find_child(NodeKind.LIST))
        )
        sense_idx, example_text = extract_sense_index(example_text)
        if not example_text:
            continue

        translate_raw_tags(example_data)
        example_data.text = example_text
        if sense_idx:
            _attach_example_to_senses(wxr, page_data, sense_idx, example_data)
            last_example = example_data
        elif last_example is not None:
            # Text without a sense index following an indexed example is
            # stored as that example's translation.
            last_example.translation = example_text
        else:
            wxr.wtp.debug(
                f"Found example data without senseid: {example_data}",
                sortid="extractor/de/examples/extract_examples/28",
            )

    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
        wxr.wtp.debug(
            f"Found unexpected non-list node in examples: {non_list_node}",
            sortid="extractor/de/examples/extract_examples/33",
        )


def _attach_example_to_senses(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    sense_idx: str,
    example_data: Example,
) -> None:
    """Append ``example_data`` to all senses matching ``sense_idx``.

    When no sense matches, a new ``no-gloss`` sense carrying the example
    is appended to the last entry of ``page_data``.
    """
    find_sense = False
    for word_entry in page_data:
        for sense in word_entry.senses:
            if match_sense_index(sense_idx, sense):
                sense.examples.append(example_data)
                find_sense = True
    if find_sense:
        return

    if not page_data:
        # No entry exists that could hold a fallback sense; report it
        # instead of failing on an unbound loop variable.
        wxr.wtp.debug(
            f"Found example data but no word entry: {example_data}",
            sortid="extractor/de/examples/extract_examples/75",
        )
        return

    new_sense = Sense(sense_index=sense_idx, tags=["no-gloss"])
    new_sense.examples.append(example_data)
    page_data[-1].senses.append(new_sense)

91 

92 

def extract_reference(
    wxr: WiktextractContext, example_data: Example, ref_node: WikiNode
) -> None:
    """Fill the reference-related fields of ``example_data`` from a ``<ref>``.

    The cleaned text of ``ref_node`` is stored in ``example_data.ref``.
    For every direct ``Literatur`` template child, each string-keyed
    parameter is looked up (case-insensitively) in
    ``LITERATUR_TEMPLATE_ARGS``; if the mapped name is a field of the
    example model, the cleaned parameter value is assigned to it.
    Unknown string keys are reported through ``wxr.wtp.debug``.
    Non-string keys are ignored.
    """
    example_data.ref = clean_node(wxr, None, ref_node.children)
    # Read the field table from the class: accessing ``model_fields`` on an
    # instance is deprecated in recent Pydantic releases.
    model_fields = type(example_data).model_fields
    for template_node in ref_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name != "Literatur":
            # XXX: Treat other templates as well.
            # E.g. https://de.wiktionary.org/wiki/Vorlage:Ref-OWID
            continue

        # https://de.wiktionary.org/wiki/Vorlage:Literatur
        for key, value in template_node.template_parameters.items():
            if not isinstance(key, str):
                continue
            field = LITERATUR_TEMPLATE_ARGS.get(key.lower())
            if field is None:
                wxr.wtp.debug(
                    f"Unexpected key in Literatur template: {key}",
                    sortid="extractor/de/examples/extract_examples/77",
                )
            elif field in model_fields:
                setattr(example_data, field, clean_node(wxr, None, value))

117 

118 

def match_sense_index(sense_idx: str, sense: Sense) -> bool:
    """Return whether the index string ``sense_idx`` refers to ``sense``.

    ``sense_idx`` may be a single index (``"1"``, ``"1.2"``, ``"1a"``), a
    comma-separated list (``"1, 3"``) and may contain numeric ranges
    (``"1-3"``).

    * Equal strings always match.
    * A dotted index without ``","`` or ``"-"`` matches only by equality.
    * Otherwise each comma-separated part matches if it equals
      ``sense.sense_index``, or if it is a ``N-M`` range that contains the
      leading number of ``sense.sense_index`` (the part before the first
      ``","``, ``"."`` or ``"-"``). Ranges never match a sense whose
      leading part is not a decimal number.
    """
    if sense_idx == sense.sense_index:
        return True

    # A dotted sub-sense index such as "1.2" that is neither a list nor a
    # range can only match exactly, which was ruled out above.
    if "." in sense_idx and "," not in sense_idx and "-" not in sense_idx:
        return False

    first_number_str = re.split(r",|\.|-", sense.sense_index, maxsplit=1)[0]
    # isdecimal() rather than isdigit(): the latter accepts characters such
    # as "²" that int() cannot parse.
    first_number = (
        int(first_number_str) if first_number_str.isdecimal() else None
    )

    for try_idx in sense_idx.split(","):
        try_idx = try_idx.strip()
        if try_idx == sense.sense_index:
            # Explicitly listed indexes match even when they are not
            # numeric (e.g. "1a").
            return True
        if first_number is not None and re.fullmatch(r"\d+-\d+", try_idx):
            start_str, end_str = try_idx.split("-")
            if int(start_str) <= first_number <= int(end_str):
                return True

    return False

142 return True 

143 

144 return False