Coverage for src/wiktextract/extractor/pt/pos.py: 94%

89 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1import re 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .example import extract_example_list_item 

14from .head_line import extract_head_line_nodes 

15from .inflection import extract_flex_template 

16from .models import AltForm, Linkage, Sense, WordEntry 

17from .section_titles import POS_DATA 

18from .tags import translate_raw_tags 

19 

20 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
    categories: list[str],
) -> None:
    """Append a WordEntry for one part-of-speech section to ``page_data``.

    A deep copy of ``base_data`` is appended and filled with the POS data
    looked up from ``POS_DATA`` by the lower-cased ``pos_title``, the given
    ``categories``, the glosses found in ``#`` lists, the head-line nodes
    that precede the first gloss list, and any ``flex.*`` templates.

    Child sections titled "Brasil" or "Portugal" each get their own entry:
    the first one reuses the entry created above, later ones start from a
    snapshot of that entry taken before any child section was processed.

    ``POS_DATA`` is indexed directly, so an unknown ``pos_title`` is not
    handled here.
    """
    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    pos_data = POS_DATA[pos_title.lower()]
    page_data[-1].pos = pos_data["pos"]
    page_data[-1].tags.extend(pos_data.get("tags", []))
    page_data[-1].categories.extend(categories)

    # Defaults to "all children" so that, when no gloss list exists, every
    # child node is handed to the head-line extractor below.
    first_gloss_index = len(level_node.children)
    for index, list_node in level_node.find_child(NodeKind.LIST, True):
        # Only lists made purely of "#" markers are treated as gloss lists.
        if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, page_data[-1], list_item)
            if index < first_gloss_index:
                first_gloss_index = index
    extract_head_line_nodes(
        wxr, page_data[-1], level_node.children[:first_gloss_index]
    )
    # The forms table template may not be in the head line, so look at all
    # direct template children of the section.
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name.startswith("flex."):
            extract_flex_template(wxr, page_data[-1], t_node)

    # Snapshot taken before any child section adds its own tags or senses.
    base_data_pos = page_data[-1].model_copy(deep=True)
    first_child_section = True
    for child_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        child_section = clean_node(wxr, None, child_level_node.largs)
        if child_section in ["Brasil", "Portugal"]:
            page_data.append(base_data_pos.model_copy(deep=True))
            if first_child_section:
                # The first regional section reuses the entry created at the
                # top of this function, so the copy just appended is dropped.
                page_data.pop()
                first_child_section = False
            page_data[-1].raw_tags.append(child_section)
            for list_node in child_level_node.find_child(NodeKind.LIST):
                if list_node.sarg.startswith("#") and list_node.sarg.endswith(
                    "#"
                ):
                    for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                        extract_gloss_list_item(wxr, page_data[-1], list_item)
            # NOTE(review): nesting reconstructed from a flattened listing;
            # confirm this call belongs inside the regional-section branch.
            translate_raw_tags(page_data[-1])

68 

69 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry | Linkage,
    list_item: WikiNode,
    parent_gloss: list[str] | None = None,
) -> None:
    """Build a Sense from one gloss list item and recurse into nested glosses.

    Args:
        wxr: Extraction context passed through to the helpers.
        word_entry: Entry whose ``senses`` list receives the new sense.
        list_item: The gloss list item node to process.
        parent_gloss: Glosses of the enclosing list item, if any. The list
            is copied, never modified.

    ``escopo`` and ``escopo2`` templates contribute raw tags instead of gloss
    text. Nested lists whose marker ends with ``*`` or ``:`` are passed to
    the example extractor. Nested lists whose marker ends with ``#`` are
    processed recursively with this sense's glosses as ``parent_gloss``.
    A sense is only stored when its own gloss text is non-empty.
    """
    gloss_nodes = []
    # Copy the parent glosses: a shared mutable default or an aliased caller
    # list must never be modified when this item's gloss is appended below.
    inherited_glosses = [] if parent_gloss is None else list(parent_gloss)
    sense = Sense(glosses=inherited_glosses)
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name == "escopo":
                extract_escopo_template(wxr, sense, node)
            elif node.template_name == "escopo2":
                sense.raw_tags.extend(extract_escopo2_template(wxr, node))
            else:
                gloss_nodes.append(node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Nested lists never become gloss text; "#" lists are handled
            # by the recursion at the end of this function.
            if node.sarg.endswith(("*", ":")):
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(wxr, sense, next_list_item)
        else:
            gloss_nodes.append(node)

    gloss_str = clean_node(wxr, sense, gloss_nodes)
    if gloss_str:
        sense.glosses.append(gloss_str)
        translate_raw_tags(sense)
        if "form-of" in word_entry.tags:
            extract_form_of_word(wxr, sense, list_item)
        word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        if child_list.sarg.endswith("#"):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(
                    wxr, word_entry, child_list_item, sense.glosses
                )

107 

108 

def extract_escopo_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Add the labels of an ``escopo`` template to ``sense.raw_tags``.

    The cleaned text of the template has leading and trailing parentheses
    stripped and is split on ``", "`` and ``" e "``. Every piece that is
    not blank is appended with surrounding whitespace removed.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:escopo
    label_text = clean_node(wxr, sense, t_node).strip("()")
    trimmed_pieces = (
        piece.strip() for piece in re.split(r", | e ", label_text)
    )
    sense.raw_tags.extend(label for label in trimmed_pieces if label != "")

119 

120 

def extract_escopo2_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
) -> list[str]:
    """Return the non-empty cleaned values of an ``escopo2`` template.

    Positional parameters 1 to 3 are read in order; reading stops at the
    first position that is missing from the template's parameters.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:escopo2
    collected: list[str] = []
    position = 1
    while position <= 3 and position in t_node.template_parameters:
        text = clean_node(wxr, None, t_node.template_parameters[position])
        if text != "":
            collected.append(text)
        position += 1
    return collected

134 

135 

def extract_form_of_word(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
) -> None:
    """Record the text of the last link in ``list_item`` as a form-of word.

    Every link node found recursively under ``list_item`` is cleaned in
    turn; only the text of the final one is kept. Nothing is added when
    that text is empty or when there are no links.
    """
    last_link_text = ""
    for link in list_item.find_child_recursively(NodeKind.LINK):
        last_link_text = clean_node(wxr, None, link)
    if last_link_text == "":
        return
    sense.form_of.append(AltForm(word=last_link_text))