Coverage for src/wiktextract/extractor/ms/example.py: 99%

53 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1import re 

2 

3from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..share import calculate_bold_offsets 

8from .linkage import LINKAGE_TEMPLATES, extract_nyms_template 

9from .models import Example, Sense, WordEntry 

10 

11 

def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    parent_e_data: Example | None = None,
) -> None:
    """Fill an ``Example`` from one list item and attach it to ``sense``.

    The children of ``list_item`` are visited in order:

    * example/quotation templates (``cp``, ``usex``, ``ux``, ``ko-usex``,
      ``uxi``, ``quote`` and any ``quote-*``) are handed to
      ``extract_cp_template``;
    * templates named in ``LINKAGE_TEMPLATES`` are handed to
      ``extract_nyms_template`` together with ``word_entry``;
    * an italic node supplies the example text at the top level, or the
      translation when this call is processing a nested list item;
    * a nested list is processed recursively, sharing the same ``Example``;
    * a bold node in the very first position whose cleaned text is exactly
      four digits marks the item as a reference line: the item's content
      outside of nested lists becomes ``ref`` and later italic nodes of this
      item are ignored.

    Args:
        wxr: Extraction context passed through to the helper functions.
        word_entry: Entry forwarded to ``extract_nyms_template``.
        sense: Sense whose ``examples`` list receives the finished example.
        list_item: The list item node to read.
        parent_e_data: ``Example`` being built by an enclosing list item, or
            ``None`` when this is the outermost item.

    Returns:
        None. The example is appended to ``sense.examples`` only by the
        outermost call, and only when its text is non-empty.
    """
    is_top_level = parent_e_data is None
    e_data = Example(text="") if is_top_level else parent_e_data
    ref_found = False

    for position, child in enumerate(list_item.children):
        if isinstance(child, TemplateNode):
            template_name = child.template_name
            if template_name in [
                "cp",
                "usex",
                "ux",
                "ko-usex",
                "uxi",
                "quote",
            ] or template_name.startswith("quote-"):
                extract_cp_template(wxr, sense, child, e_data)
                continue
            if template_name in LINKAGE_TEMPLATES:
                extract_nyms_template(wxr, word_entry, child)
                continue
            # Any other template falls through to the generic node checks.

        if not isinstance(child, WikiNode):
            # Plain strings carry nothing that is extracted here.
            continue

        kind = child.kind
        if kind == NodeKind.ITALIC and not ref_found:
            cleaned = clean_node(wxr, sense, child)
            if is_top_level:
                e_data.text = cleaned
                offsets_field = "bold_text_offsets"
                calculate_bold_offsets(
                    wxr, child, e_data.text, e_data, offsets_field
                )
            else:
                # Italic text inside a nested item is the translation.
                e_data.translation = cleaned
                offsets_field = "bold_translation_offsets"
                calculate_bold_offsets(
                    wxr, child, e_data.translation, e_data, offsets_field
                )
        elif kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, nested_item, e_data
                )
        elif position == 0 and kind == NodeKind.BOLD:
            # A leading bold four-digit number (looks like a year) marks a
            # reference line.
            if re.fullmatch(r"\d{4}", clean_node(wxr, None, child)):
                non_list_children = list(
                    list_item.invert_find_child(NodeKind.LIST)
                )
                e_data.ref = clean_node(wxr, sense, non_list_children)
                ref_found = True

    if is_top_level and e_data.text != "":
        sense.examples.append(e_data)

68 

69 

def extract_cp_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode, e_data: Example
) -> None:
    """Populate ``e_data`` from the expanded output of an example template.

    The template is rendered back to wikitext, re-parsed with full
    expansion, and every HTML node in the result is inspected. The node's
    ``class`` attribute selects which ``Example`` field receives the cleaned
    text; the first matching marker in the table below wins.

    Args:
        wxr: Extraction context used for parsing and cleaning.
        sense: Sense passed to the final ``clean_node`` call on the whole
            expansion.
        t_node: The example/quotation template node to expand.
        e_data: Example object that is updated in place.

    Returns:
        None.
    """
    # (class marker, Example field, bold-offsets field or None), in the
    # order in which the markers take precedence.
    class_dispatch = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-quotation", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
        ("cited-source", "ref", None),
    )

    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_template = wxr.wtp.parse(wikitext, expand_all=True)

    for html_tag in expanded_template.find_child_recursively(NodeKind.HTML):
        html_class = html_tag.attrs.get("class", "")
        for marker, field_name, offsets_field in class_dispatch:
            if marker not in html_class:
                continue
            setattr(e_data, field_name, clean_node(wxr, None, html_tag))
            if offsets_field is not None:
                calculate_bold_offsets(
                    wxr,
                    html_tag,
                    getattr(e_data, field_name),
                    e_data,
                    offsets_field,
                )
            break  # only the first matching marker applies to a tag

    # The return value is discarded; the call is made with ``sense`` as the
    # target, presumably so clean_node can record data on it — confirm.
    clean_node(wxr, sense, expanded_template)