Coverage for src/wiktextract/extractor/ms/example.py: 99%

53 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1import re 

2 

3from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..share import calculate_bold_offsets 

8from .linkage import LINKAGE_TEMPLATES, extract_nyms_template 

9from .models import Example, Sense, WordEntry 

10 

11 

def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    parent_e_data: Example | None = None,
) -> None:
    """Collect example data from one list item and attach it to ``sense``.

    When ``parent_e_data`` is ``None`` a fresh ``Example`` is created and,
    if it ends up with non-empty ``text``, appended to ``sense.examples``.
    When ``parent_e_data`` is given (the recursive call made for nested
    lists), that same object is filled in further and nothing is appended
    here.

    Children of ``list_item`` are handled as follows:

    * example/quotation templates (``cp``, ``usex``, ``ux``, ``ko-usex``,
      ``uxi``, ``quote`` and any ``quote-*``) go to ``extract_cp_template``;
    * templates listed in ``LINKAGE_TEMPLATES`` go to
      ``extract_nyms_template``;
    * an italic node supplies ``text`` at the top level or ``translation``
      in a nested item, unless a reference line was already detected;
    * a nested list is processed recursively, sharing the same ``Example``;
    * a bold node in first position whose cleaned text is exactly four
      digits (presumably a year) marks the item as a reference line.
    """
    is_top_level = parent_e_data is None
    example = Example(text="") if is_top_level else parent_e_data
    ref_found = False

    for position, child in enumerate(list_item.children):
        if isinstance(child, TemplateNode) and (
            child.template_name
            in ["cp", "usex", "ux", "ko-usex", "uxi", "quote"]
            or child.template_name.startswith("quote-")
        ):
            extract_cp_template(wxr, sense, child, example)
        elif (
            isinstance(child, TemplateNode)
            and child.template_name in LINKAGE_TEMPLATES
        ):
            extract_nyms_template(wxr, word_entry, child)
        elif isinstance(child, WikiNode):
            kind = child.kind
            if kind == NodeKind.ITALIC and not ref_found:
                if is_top_level:
                    example.text = clean_node(wxr, sense, child)
                    calculate_bold_offsets(
                        wxr, child, example.text, example, "bold_text_offsets"
                    )
                else:
                    example.translation = clean_node(wxr, sense, child)
                    calculate_bold_offsets(
                        wxr,
                        child,
                        example.translation,
                        example,
                        "bold_translation_offsets",
                    )
            elif kind == NodeKind.LIST:
                # Nested items keep writing into the same Example object.
                for nested_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(
                        wxr, word_entry, sense, nested_item, example
                    )
            elif position == 0 and kind == NodeKind.BOLD:
                # Only a bold node that is the very first child counts.
                if re.fullmatch(r"\d{4}", clean_node(wxr, None, child)):
                    # The reference text is built from the children yielded
                    # by invert_find_child(NodeKind.LIST, ...).
                    ref_nodes = list(
                        list_item.invert_find_child(
                            NodeKind.LIST, include_empty_str=True
                        )
                    )
                    example.ref = clean_node(wxr, sense, ref_nodes)
                    ref_found = True

    if is_top_level and example.text != "":
        sense.examples.append(example)

72 

73 

def extract_cp_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode, e_data: Example
) -> None:
    """Fill ``e_data`` from the HTML produced by an example/quote template.

    The template is converted back to wikitext, parsed with
    ``expand_all=True``, and every HTML node in the result is inspected.
    The node's ``class`` attribute selects which ``Example`` field receives
    the node's cleaned text; the first matching rule wins. All fields except
    ``ref`` also get bold offsets computed via ``calculate_bold_offsets``.

    Finally the whole expanded tree is passed to ``clean_node`` together
    with ``sense``; the return value is discarded.
    """
    # (class substrings, Example field, bold-offsets field or None),
    # checked in this order for each HTML node.
    rules = (
        (("e-example", "e-quotation"), "text", "bold_text_offsets"),
        (("e-transliteration",), "roman", "bold_roman_offsets"),
        (("e-translation",), "translation", "bold_translation_offsets"),
        (("e-literally",), "literal_meaning", "bold_literal_offsets"),
        (("cited-source",), "ref", None),
    )

    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_template = wxr.wtp.parse(wikitext, expand_all=True)

    for html_tag in expanded_template.find_child_recursively(NodeKind.HTML):
        html_class = html_tag.attrs.get("class", "")
        for markers, field_name, offsets_field in rules:
            if not any(marker in html_class for marker in markers):
                continue
            setattr(e_data, field_name, clean_node(wxr, None, html_tag))
            if offsets_field is not None:
                calculate_bold_offsets(
                    wxr,
                    html_tag,
                    getattr(e_data, field_name),
                    e_data,
                    offsets_field,
                )
            break

    clean_node(wxr, sense, expanded_template)