Coverage for src/wiktextract/extractor/pt/pos.py: 94%

1import re

3from wikitextprocessor.parser import (

4 LEVEL_KIND_FLAGS,

5 LevelNode,

6 NodeKind,

7 TemplateNode,

8 WikiNode,

11from ...page import clean_node

12from ...wxr_context import WiktextractContext

13from .example import extract_example_list_item

14from .head_line import extract_head_line_nodes

15from .inflection import extract_flex_template

16from .models import AltForm, Linkage, Sense, WordEntry

17from .section_titles import POS_DATA

18from .tags import translate_raw_tags

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
    categories: list[str],
) -> None:
    """Create the entry for one part-of-speech section and add it to page_data.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled with
    the section title, the ``POS_DATA`` values stored under the lower-cased
    title, and ``categories``.  Glosses are read from the section's ``#``
    lists, the nodes in front of the first such list are handed to the
    head-line extractor, and ``flex.*`` templates are handed to the
    inflection extractor.

    Child sections whose title is "Brasil" or "Portugal" are treated as
    regional variants: the first one reuses the entry created above, each
    later one gets a fresh copy of that entry as it was before any regional
    section was processed.

    ``POS_DATA`` is indexed directly, so an unknown ``pos_title`` is not
    handled here.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title.lower()]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))
    entry.categories.extend(categories)

    # Everything in front of the first gloss list counts as the head line.
    head_end = len(level_node.children)
    for position, child_list in level_node.find_child(NodeKind.LIST, True):
        marker = child_list.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for item in child_list.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, item)
        # NOTE(review): nesting reconstructed from a listing without
        # indentation; confirm only gloss lists move the head-line boundary.
        head_end = min(head_end, position)

    head_nodes = level_node.children[:head_end]
    extract_head_line_nodes(wxr, entry, head_nodes)

    # The forms-table template is not always part of the head line, so every
    # template returned by find_child is checked.
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name.startswith("flex."):
            extract_flex_template(wxr, entry, template)

    # Snapshot taken before regional tags/senses are added to the entry.
    pos_snapshot = entry.model_copy(deep=True)
    reuse_current_entry = True
    for subsection in level_node.find_child(LEVEL_KIND_FLAGS):
        region = clean_node(wxr, None, subsection.largs)
        if region not in ("Brasil", "Portugal"):
            continue
        if reuse_current_entry:
            # The first regional section is stored in the existing entry.
            reuse_current_entry = False
        else:
            page_data.append(pos_snapshot.model_copy(deep=True))
        regional_entry = page_data[-1]
        regional_entry.raw_tags.append(region)
        for sub_list in subsection.find_child(NodeKind.LIST):
            sub_marker = sub_list.sarg
            if sub_marker.startswith("#") and sub_marker.endswith("#"):
                for item in sub_list.find_child(NodeKind.LIST_ITEM):
                    extract_gloss_list_item(wxr, regional_entry, item)
        # NOTE(review): nesting reconstructed from a listing without
        # indentation; confirm this call runs once per regional section.
        translate_raw_tags(regional_entry)

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry | Linkage,
    list_item: WikiNode,
    parent_gloss: list[str] | None = None,
) -> None:
    """Turn one gloss list item into a ``Sense`` and add it to ``word_entry``.

    Args:
        wxr: Extraction context passed through to the helper functions.
        word_entry: Object whose ``tags`` are read and whose ``senses`` list
            receives the new sense.
        list_item: The list-item node holding the gloss.
        parent_gloss: Glosses of the enclosing sense when this item comes
            from a nested ``#`` list; ``None`` (the default) for a top-level
            gloss.  The list is copied and never modified.

    The sense is only stored when the cleaned gloss text is non-empty.
    Nested lists ending in ``#`` are processed recursively in either case,
    receiving this sense's glosses as their ``parent_gloss``.
    """
    gloss_nodes = []
    # Copy explicitly: a shared default list (or the parent's own list) must
    # never be mutated by the append below, whatever Sense does with its input.
    sense = Sense(glosses=list(parent_gloss or []))
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name == "escopo":
                extract_escopo_template(wxr, sense, node)
            elif node.template_name == "escopo2":
                sense.raw_tags.extend(extract_escopo2_template(wxr, node))
            else:
                # Any other template contributes to the gloss text.
                gloss_nodes.append(node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # "*" and ":" lists hold examples; other nested lists are kept
            # out of the gloss text ("#" lists are handled further down).
            if node.sarg.endswith(("*", ":")):
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(wxr, sense, next_list_item)
        else:
            gloss_nodes.append(node)

    gloss_str = clean_node(wxr, sense, gloss_nodes)
    if gloss_str:
        sense.glosses.append(gloss_str)
        translate_raw_tags(sense)
        if "form-of" in word_entry.tags:
            extract_form_of_word(wxr, sense, list_item)
        word_entry.senses.append(sense)

    # Sub-glosses: each nested "#" item becomes its own sense that starts
    # with the glosses collected for this one.
    for child_list in list_item.find_child(NodeKind.LIST):
        if child_list.sarg.endswith("#"):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(
                    wxr, word_entry, child_list_item, sense.glosses
                )

107

108

def extract_escopo_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Add the labels produced by an "escopo" template to ``sense.raw_tags``.

    The template is cleaned to text with ``sense`` as the data target, any
    leading or trailing parentheses are removed, and the remainder is split
    on ", " and " e ".  Each non-blank piece is stored with surrounding
    whitespace stripped.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:escopo
    rendered = clean_node(wxr, sense, t_node)
    label_text = rendered.strip("()")
    labels = [piece.strip() for piece in re.split(r", | e ", label_text)]
    sense.raw_tags.extend([label for label in labels if label != ""])

119

120

def extract_escopo2_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
) -> list[str]:
    """Return the raw tags given as positional arguments of "escopo2".

    Positional parameters 1 to 3 are read in order; reading stops at the
    first one that is missing.  Parameters that clean to an empty string are
    skipped without stopping.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:escopo2
    params = t_node.template_parameters
    collected: list[str] = []
    for position in (1, 2, 3):
        if position in params:
            text = clean_node(wxr, None, params[position])
            if text != "":
                collected.append(text)
        else:
            break
    return collected

134

135

def extract_form_of_word(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
) -> None:
    """Store the word a form-of gloss refers to in ``sense.form_of``.

    Every link node yielded by ``find_child_recursively`` is cleaned to
    text; the text of the last one is kept.  Nothing is added when there is
    no link or the last link cleans to an empty string.
    """
    last_link_text = ""
    for link in list_item.find_child_recursively(NodeKind.LINK):
        # Later links overwrite earlier ones on purpose: only the last counts.
        last_link_text = clean_node(wxr, None, link)
    if last_link_text == "":
        return
    sense.form_of.append(AltForm(word=last_link_text))