Coverage for src/wiktextract/extractor/de/example.py: 85%

1import re

3from wikitextprocessor import LevelNode, NodeKind, WikiNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from ..share import calculate_bold_offsets

8from .models import Example, Sense, WordEntry

9from .tags import translate_raw_tags

10from .utils import extract_sense_index

# Maps lower-cased parameter names of the German Wiktionary "Literatur"
# template (https://de.wiktionary.org/wiki/Vorlage:Literatur) to the name of
# the `Example` attribute that `extract_reference` stores the cleaned value
# in. Several template parameters share one target attribute, e.g. "autor"
# and "a" both map to "author", "herausgeber" and "hrsg" both to "editor".
LITERATUR_TEMPLATE_ARGS = {
    "autor": "author",
    "a": "author",
    "titel": "title",
    "titelerg": "title_complement",
    "auflage": "edition",
    "verlag": "publisher",
    "ort": "place",
    "jahr": "year",
    "seiten": "pages",
    "isbn": "isbn",
    "übersetzer": "translator",
    "herausgeber": "editor",
    "sammelwerk": "collection",
    "werk": "collection",
    "band": "volume",
    "kommentar": "comment",
    "online": "url",
    "tag": "day",
    "monat": "month",
    "zugriff": "accessdate",
    "nummer": "number",
    "datum": "date",
    "hrsg": "editor",
}

def extract_examples(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract the examples of an examples section into `word_entry`.

    Every list item below `level_node` is visited:

    * An item whose list marker does not end with ":" is read as a raw tag
      that replaces the raw tags collected so far and is applied to the
      examples that follow.
    * An item whose marker ends with ":" is an example line. `<ref>` tags in
      it are handed to `extract_reference`. If the line carries a sense
      index, the example is appended to every sense for which
      `match_sense_index` is true, or to a newly created "no-gloss" sense when
      none matches. If it carries no sense index, its text is stored as the
      translation of the most recent indexed example.

    Nodes directly below `level_node` that are not lists are only reported
    through the debug log.
    """
    last_example = None
    raw_tags = []
    for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        if not list_item_node.sarg.endswith(":"):
            raw_tags.clear()
            raw_tag = clean_node(wxr, None, list_item_node.children)
            raw_tag = raw_tag.strip(": ")
            if raw_tag != "":
                raw_tags.append(raw_tag)
        else:
            example_data = Example(raw_tags=raw_tags)
            for ref_tag in list_item_node.find_html("ref"):
                extract_reference(wxr, example_data, ref_tag)
            # Re-parse the item without its nested lists so that the text of
            # sub-items does not leak into this example.
            example_text_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(
                    list(list_item_node.invert_find_child(NodeKind.LIST))
                )
            )
            example_text = clean_node(wxr, None, example_text_node)
            sense_idx, example_text = extract_sense_index(example_text)
            if len(example_text) > 0:
                translate_raw_tags(example_data)
                example_data.text = example_text
                calculate_bold_offsets(
                    wxr,
                    example_text_node,
                    example_text,
                    example_data,
                    "italic_text_offsets",
                    extra_node_kind=NodeKind.ITALIC,
                )
                if len(sense_idx) > 0:
                    find_sense = False
                    for sense in word_entry.senses:
                        if match_sense_index(sense_idx, sense):
                            sense.examples.append(example_data)
                            find_sense = True
                    if not find_sense:
                        new_sense = Sense(
                            sense_index=sense_idx, tags=["no-gloss"]
                        )
                        new_sense.examples.append(example_data)
                        word_entry.senses.append(new_sense)
                    last_example = example_data
                elif last_example is not None:
                    last_example.translation = example_text
                    # The offsets describe the translation text, so they must
                    # go on the example that receives the translation.
                    # `example_data` is a throwaway object in this branch: it
                    # is never attached to any sense.
                    calculate_bold_offsets(
                        wxr,
                        example_text_node,
                        example_text,
                        last_example,
                        "italic_translation_offsets",
                        extra_node_kind=NodeKind.ITALIC,
                    )
                else:
                    wxr.wtp.debug(
                        f"Found example data without senseid: {example_data}",
                        sortid="extractor/de/examples/extract_examples/28",
                    )
                    last_example = None

    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
        wxr.wtp.debug(
            f"Found unexpected non-list node in examples: {non_list_node}",
            sortid="extractor/de/examples/extract_examples/33",
        )

110

111

def extract_reference(
    wxr: WiktextractContext, example_data: Example, ref_node: WikiNode
):
    """Fill the reference fields of `example_data` from a `<ref>` node.

    The cleaned text of the whole node is stored in `example_data.ref`.
    For every "Literatur" template directly inside the node, each string
    parameter listed in `LITERATUR_TEMPLATE_ARGS` is cleaned and written to
    the mapped attribute, provided `example_data` has such an attribute.
    String parameters that are not listed are reported through the debug
    log; non-string (positional) keys are ignored.
    """
    example_data.ref = clean_node(wxr, None, ref_node.children)
    for template_node in ref_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name != "Literatur":
            # XXX: Treat other templates as well.
            # E.g. https://de.wiktionary.org/wiki/Vorlage:Ref-OWID
            continue

        # https://de.wiktionary.org/wiki/Vorlage:Literatur
        for key, value in template_node.template_parameters.items():
            if not isinstance(key, str):
                continue
            target_attr = LITERATUR_TEMPLATE_ARGS.get(key.lower())
            if target_attr is None:
                wxr.wtp.debug(
                    f"Unexpected key in Literatur template: {key}",
                    sortid="extractor/de/examples/extract_examples/77",
                )
            elif hasattr(example_data, target_attr):
                cleaned_value = clean_node(wxr, None, value)
                setattr(example_data, target_attr, cleaned_value)

136

137

def match_sense_index(sense_idx: str, sense: Sense) -> bool:
    """Return whether the example index `sense_idx` refers to `sense`.

    `sense_idx` may be a single index ("2", "1.1"), a comma-separated list
    ("1, 3") and may contain numeric ranges written with "-" or "–" ("1-3").

    * A dotted index without "," or "-" must equal `sense.sense_index`.
    * Otherwise the sense matches when its index equals `sense_idx` or one
      of the comma-separated parts, or when the leading number of its index
      lies inside one of the ranges (bounds inclusive).
    """
    # A dotted index with no list/range separator is compared literally only.
    if "." in sense_idx and "," not in sense_idx and "-" not in sense_idx:
        return sense_idx == sense.sense_index

    if sense_idx == sense.sense_index:
        return True

    # Only range matching needs the leading number of the sense's index.
    # isdecimal() is used instead of isdigit() because isdigit() also accepts
    # characters such as "²", for which int() raises ValueError.
    first_number_str = re.split(r",|\.|-|–", sense.sense_index, maxsplit=1)[0]
    first_number = None
    if first_number_str.isdecimal():
        first_number = int(first_number_str)

    for try_idx in sense_idx.split(","):
        try_idx = try_idx.strip()
        # Literal list members must match even when the sense's index has no
        # purely numeric leading part (e.g. "1a").
        if try_idx == sense.sense_index:
            return True
        if first_number is not None and re.fullmatch(r"\d+[\-–]\d+", try_idx):
            start_str, end_str = re.split(r"-|–", try_idx, maxsplit=1)
            if int(start_str) <= first_number <= int(end_str):
                return True

    return False