Coverage for src / wiktextract / extractor / de / example.py: 86%

68 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..share import calculate_bold_offsets 

8from .models import Example, Sense, WordEntry 

9from .tags import translate_raw_tags 

10from .utils import extract_sense_index 

11 

12 

def extract_examples(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect examples from the list items under ``level_node``.

    Every list item found recursively below ``level_node`` is handled in
    document order:

    * An item whose list marker (``sarg``) does not end with ":" is read as a
      raw tag. It replaces the current raw tags, which are passed to the
      examples built from the items that follow.
    * An item whose marker ends with ":" is an example line. If a sense index
      can be extracted from its text, the example is appended to every sense
      of ``word_entry`` that matches the index, or to a newly created
      "no-gloss" sense when none matches. If the line has no sense index, its
      text is stored as the translation of the most recent indexed example.

    Direct children of ``level_node`` that are not lists are reported through
    ``wxr.wtp.debug``. The function mutates ``word_entry`` and returns nothing.
    """
    last_example = None
    raw_tags = []
    for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        if not list_item_node.sarg.endswith(":"):
            # Tag line: start a fresh tag set for the examples that follow.
            raw_tags.clear()
            raw_tag = clean_node(wxr, None, list_item_node.children)
            raw_tag = raw_tag.strip(": ")
            if raw_tag != "":
                raw_tags.append(raw_tag)
            continue

        example_data = Example(text="", raw_tags=raw_tags)
        for ref_tag in list_item_node.find_html("ref"):
            example_data.ref = clean_node(wxr, None, ref_tag.children)

        # Re-parse the item without its nested lists so that the text of
        # child list items is not folded into this example.
        example_text_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                list(
                    list_item_node.invert_find_child(
                        NodeKind.LIST, include_empty_str=True
                    )
                )
            )
        )
        example_text = clean_node(wxr, None, example_text_node)
        sense_idx, example_text = extract_sense_index(example_text)
        if len(example_text) == 0:
            continue

        translate_raw_tags(example_data)
        example_data.text = example_text
        calculate_bold_offsets(
            wxr,
            example_text_node,
            example_text,
            example_data,
            "bold_text_offsets",
            extra_node_kind=NodeKind.ITALIC,
        )
        if len(sense_idx) > 0:
            find_sense = False
            for sense in word_entry.senses:
                if match_sense_index(sense_idx, sense):
                    sense.examples.append(example_data)
                    find_sense = True
            if not find_sense:
                # Keep the example even though no gloss carries this index.
                new_sense = Sense(sense_index=sense_idx, tags=["no-gloss"])
                new_sense.examples.append(example_data)
                word_entry.senses.append(new_sense)
            last_example = example_data
        elif last_example is not None:
            # Line without a sense index: it is the translation of the
            # previous example. The offsets must be recorded on that example;
            # `example_data` built for this line is never attached to a
            # sense, so writing the offsets there would discard them.
            last_example.translation = example_text
            calculate_bold_offsets(
                wxr,
                example_text_node,
                example_text,
                last_example,
                "bold_translation_offsets",
                extra_node_kind=NodeKind.ITALIC,
            )
        else:
            wxr.wtp.debug(
                f"Found example data without senseid: {example_data}",
                sortid="extractor/de/examples/extract_examples/28",
            )
            last_example = None

    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
        wxr.wtp.debug(
            f"Found unexpected non-list node in examples: {non_list_node}",
            sortid="extractor/de/examples/extract_examples/33",
        )

88 

89 

def extract_reference(
    wxr: WiktextractContext, example_data: Example, ref_node: WikiNode
):
    """Set ``example_data.ref`` to the cleaned text of ``ref_node``'s children."""
    reference_text = clean_node(wxr, None, ref_node.children)
    example_data.ref = reference_text

94 

95 

def match_sense_index(sense_idx: str, sense: Sense) -> bool:
    """Tell whether the index string ``sense_idx`` refers to ``sense``.

    ``sense_idx`` may be a single index ("2", "1.1"), a comma-separated list
    ("1, 3") or contain numeric ranges written with "-" or "–" ("1-3").
    A range matches when the leading number of ``sense.sense_index`` lies
    within it (bounds inclusive).
    """
    target = sense.sense_index
    if sense_idx == target:
        return True

    # A dotted index without "," or "-" names exactly one sense, so anything
    # short of equality is a miss.
    if "." in sense_idx and "," not in sense_idx and "-" not in sense_idx:
        return False

    # Ranges are compared against the leading number of the sense's index.
    leading = re.split(r",|\.|-|–", target, maxsplit=1)[0]
    if not leading.isdigit():
        return False
    sense_number = int(leading)

    for candidate in (part.strip() for part in sense_idx.split(",")):
        if candidate == target:
            return True
        if re.fullmatch(r"\d+[\-–]\d+", candidate):
            low, high = re.split(r"-|–", candidate, maxsplit=1)
            if int(low) <= sense_number <= int(high):
                return True

    return False