Coverage for src/wiktextract/extractor/ms/example.py: 99%

1import re

3from wikitextprocessor import NodeKind, TemplateNode, WikiNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from ..share import calculate_bold_offsets

8from .linkage import LINKAGE_TEMPLATES, extract_nyms_template

9from .models import Example, Sense, WordEntry

12def extract_example_list_item(

13 wxr: WiktextractContext,

14 word_entry: WordEntry,

15 sense: Sense,

16 list_item: WikiNode,

17 parent_e_data: Example | None = None,

18) -> None:

19 e_data = Example(text="") if parent_e_data is None else parent_e_data

20 is_first_node = True

21 is_ref = False

22 for node in list_item.children:

23 if isinstance(node, TemplateNode) and (

24 node.template_name

25 in ["cp", "usex", "ux", "ko-usex", "uxi", "quote"]

26 or node.template_name.startswith("quote-")

27 ):

28 extract_cp_template(wxr, sense, node, e_data)

29 elif (

30 isinstance(node, TemplateNode)

31 and node.template_name in LINKAGE_TEMPLATES

32 ):

33 extract_nyms_template(wxr, word_entry, node)

34 elif isinstance(node, WikiNode):

35 if node.kind == NodeKind.ITALIC and not is_ref:

36 if parent_e_data is None:

37 e_data.text = clean_node(wxr, sense, node)

38 calculate_bold_offsets(

39 wxr, node, e_data.text, e_data, "bold_text_offsets"

40 )

41 else:

42 e_data.translation = clean_node(wxr, sense, node)

43 calculate_bold_offsets(

44 wxr,

45 node,

46 e_data.translation,

47 e_data,

48 "bold_translation_offsets",

49 )

50 elif node.kind == NodeKind.LIST:

51 for child_list_item in node.find_child(NodeKind.LIST_ITEM):

52 extract_example_list_item(

53 wxr, word_entry, sense, child_list_item, e_data

54 )

55 elif is_first_node and node.kind == NodeKind.BOLD:

56 bold_text = clean_node(wxr, None, node)

57 if re.fullmatch(r"\d{4}", bold_text): 57 ↛ 64line 57 didn't jump to line 64 because the condition on line 57 was always true

58 e_data.ref = clean_node(

59 wxr,

60 sense,

61 list(list_item.invert_find_child(NodeKind.LIST)),

62 )

63 is_ref = True

64 is_first_node = False

66 if e_data.text != "" and parent_e_data is None:

67 sense.examples.append(e_data)

def extract_cp_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode, e_data: Example
) -> None:
    """Populate ``e_data`` from the expanded output of an example template.

    ``t_node`` is converted back to wikitext and re-parsed with
    ``expand_all=True``. Every HTML node in the result is then checked
    against the substrings below, looked up in its ``class`` attribute;
    the first rule that matches decides which ``e_data`` field receives
    the cleaned text of that node:

    * ``e-example`` / ``e-quotation`` -> ``text``
    * ``e-transliteration``           -> ``roman``
    * ``e-translation``               -> ``translation``
    * ``e-literally``                 -> ``literal_meaning``
    * ``cited-source``                -> ``ref`` (no bold offsets)

    For every field except ``ref``, ``calculate_bold_offsets`` is also
    called with the matching ``bold_*_offsets`` field name.
    """
    # (class substrings, Example field, bold-offsets field or None),
    # listed in the order in which they take precedence.
    rules = (
        (("e-example", "e-quotation"), "text", "bold_text_offsets"),
        (("e-transliteration",), "roman", "bold_roman_offsets"),
        (("e-translation",), "translation", "bold_translation_offsets"),
        (("e-literally",), "literal_meaning", "bold_literal_offsets"),
        (("cited-source",), "ref", None),
    )

    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_template = wxr.wtp.parse(template_wikitext, expand_all=True)

    for html_tag in expanded_template.find_child_recursively(NodeKind.HTML):
        css_classes = html_tag.attrs.get("class", "")
        for markers, field_name, offsets_field in rules:
            if not any(marker in css_classes for marker in markers):
                continue
            setattr(e_data, field_name, clean_node(wxr, None, html_tag))
            if offsets_field is not None:
                calculate_bold_offsets(
                    wxr,
                    html_tag,
                    getattr(e_data, field_name),
                    e_data,
                    offsets_field,
                )
            break

    # The return value is discarded; the call is made with ``sense`` as the
    # target (presumably so clean_node can record data such as categories
    # on it -- confirm against clean_node).
    clean_node(wxr, sense, expanded_template)