Coverage for src/wiktextract/extractor/ms/linkage.py: 86% (92 statements)

coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

from collections import defaultdict

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, Linkage, WordEntry
from .section_titles import LINKAGE_SECTIONS
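
# extract_form_section: collects word forms listed under a form heading.
# Script templates (ARchar, Arab, PSchar, SDchar) and bare wiki links become
# Form objects carrying the tags supplied by the caller; links nested inside
# bullet lists are picked up the same way.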

def extract_form_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    tags: list[str],
) -> None:
    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        if (
            isinstance(node, TemplateNode)
            and node.template_name in ["ARchar", "Arab", "PSchar", "SDchar"]
        ) or node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if word != "":
                word_entry.forms.append(Form(form=word, tags=tags))
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for node in list_item.find_child(NodeKind.LINK):
                word = clean_node(wxr, None, node)
                if word != "":
                    word_entry.forms.append(Form(form=word, tags=tags))
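
# extract_linkage_section: walks the bullet lists under a linkage heading and
# collects Linkage objects per target field in a temporary dict.  The result
# is merged into base_data when no entry for the current language exists yet,
# into every entry of the last language when the heading is a level-3 node,
# and into the last entry otherwise.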

def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    l_dict = defaultdict(list)
    linkage_name = clean_node(wxr, None, level_node.largs).lower()
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            new_l_name = extract_linkage_list_item(
                wxr, l_dict, linkage_name, list_item
            )
            if new_l_name != "":
                linkage_name = new_l_name

    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        for field, data in l_dict.items():
            getattr(base_data, field).extend(data)
    elif level_node.kind == NodeKind.LEVEL3:
        for data in page_data:
            if data.lang_code == page_data[-1].lang_code:
                for field, l_data in l_dict.items():
                    getattr(data, field).extend(l_data)
    else:
        for field, l_data in l_dict.items():
            getattr(page_data[-1], field).extend(l_data)
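
# extract_linkage_list_item: processes one list item and returns the current
# linkage name (possibly updated).  Definition-style items rename the linkage
# type and split their definition into links and comma-separated words; items
# containing bold text are handled as proverbs; plain items yield linked
# words, optionally scoped by a {{sense}} template or an inline "name:"
# prefix that switches the linkage type.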

def extract_linkage_list_item(
    wxr: WiktextractContext,
    l_dict: dict[str, list[Linkage]],
    linkage_name: str,
    list_item: WikiNode,
) -> str:
    if list_item.definition is not None and len(list_item.definition) > 0:
        linkage_name = clean_node(wxr, None, list_item.children).lower()
        if linkage_name not in LINKAGE_SECTIONS:
            return ""
        for node in list_item.definition:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                if word != "":
                    l_dict[LINKAGE_SECTIONS[linkage_name]].append(
                        Linkage(word=word)
                    )
            elif isinstance(node, str):
                for word in node.split(","):
                    word = word.strip(" .\n")
                    if word != "":
                        l_dict[LINKAGE_SECTIONS[linkage_name]].append(
                            Linkage(word=word)
                        )
    elif list_item.contain_node(NodeKind.BOLD):
        extract_proverb_list(
            wxr, l_dict, list_item, LINKAGE_SECTIONS[linkage_name]
        )
    else:
        sense = ""
        for node in list_item.children:
            if isinstance(node, TemplateNode) and node.template_name == "sense":
                sense = clean_node(wxr, None, node).strip("(): ")
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                if word != "" and linkage_name in LINKAGE_SECTIONS:
                    l_dict[LINKAGE_SECTIONS[linkage_name]].append(
                        Linkage(word=word, sense=sense)
                    )
            elif isinstance(node, str) and node.strip().endswith(":"):
                new_linkage_name = node.strip("(): ").lower()
                if new_linkage_name in LINKAGE_SECTIONS:
                    linkage_name = new_linkage_name

    return linkage_name
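
# LINKAGE_TEMPLATES: maps linkage template names (Malay and English aliases)
# to the WordEntry list field that receives the extracted Linkage objects.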

LINKAGE_TEMPLATES = {
    "antonim": "antonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "sinonim": "synonyms",
    "synonyms": "synonyms",
    "syn": "synonyms",
    "sin": "synonyms",
    "hypernyms": "hypernyms",
    "hyper": "hypernyms",
    "kata setara": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
    "perkataan koordinat": "coordinate_terms",
    "cot": "coordinate_terms",
    "hiponim": "hyponyms",
    "hipo": "hyponyms",
    "hyponyms": "hyponyms",
}
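
# extract_nyms_template: expands a nyms-style template (Modul:nyms), keeps the
# <span> elements whose lang attribute matches the template's first parameter,
# and appends each word to the WordEntry field chosen by LINKAGE_TEMPLATES,
# attaching the glosses of the most recent sense when available.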

def extract_nyms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    # Modul:nyms
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html_recursively("span"):
        if lang_code == span_tag.attrs.get("lang", ""):
            word = clean_node(wxr, None, span_tag)
            if word != "":
                l_data = Linkage(word=word)
                if (
                    len(word_entry.senses) > 0
                    and len(word_entry.senses[-1].glosses) > 0
                ):
                    l_data.sense = " ".join(word_entry.senses[-1].glosses)
                getattr(
                    word_entry, LINKAGE_TEMPLATES[t_node.template_name]
                ).append(l_data)
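
# extract_proverb_list: handles a list item made of one or more bold proverbs
# followed by ": explanation" text; each bold run becomes Linkage.word and the
# text after the first colon (plus any remaining child nodes) becomes the
# shared sense.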

def extract_proverb_list(
    wxr: WiktextractContext,
    l_dict: dict[str, list[Linkage]],
    list_item: WikiNode,
    linkage_type: str,
) -> None:
    proverbs = []
    after_bold = False
    sense = ""
    for index, node in enumerate(list_item.children):
        if isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
            proverb = clean_node(wxr, None, node)
            if proverb != "":
                proverbs.append(proverb)
            after_bold = True
        elif after_bold and isinstance(node, str) and ":" in node:
            sense = clean_node(
                wxr,
                None,
                [node[node.index(":") + 1 :]] + list_item.children[index + 1 :],
            )
    for proverb in proverbs:
        l_dict[linkage_type].append(Linkage(word=proverb, sense=sense))
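
# ---------------------------------------------------------------------------
# Usage sketch (not part of the module above, and not taken from this
# coverage report): a minimal, test-style way to drive extract_linkage_section
# with an installed wiktextract.  The constructor arguments, the WordEntry
# fields, the page text, and the assumption that "sinonim" is a key of
# LINKAGE_SECTIONS all follow the usual wiktextract test pattern and should
# be treated as illustrative only.
#
#   from wikitextprocessor import Wtp
#   from wiktextract.config import WiktionaryConfig
#   from wiktextract.extractor.ms.linkage import extract_linkage_section
#   from wiktextract.extractor.ms.models import WordEntry
#   from wiktextract.wxr_context import WiktextractContext
#
#   wxr = WiktextractContext(
#       Wtp(lang_code="ms"),
#       WiktionaryConfig(dump_file_lang_code="ms", capture_language_codes=None),
#   )
#   wxr.wtp.start_page("contoh")
#   root = wxr.wtp.parse("====Sinonim====\n* [[teladan]]")
#   entry = WordEntry(word="contoh", lang_code="ms", lang="Melayu", pos="noun")
#   extract_linkage_section(wxr, [entry], entry, root.children[0])
#   # entry.synonyms should now contain one Linkage with word "teladan".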