Coverage for src/wiktextract/extractor/ko/linkage.py: 61%

122 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-24 07:36 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Linkage, WordEntry 

8from .section_titles import LINKAGE_SECTIONS 

9from .tags import translate_raw_tags 

10 

# Names of the ko.wiktionary "box" templates (틀:파생어_상자, 틀:합성어_상자)
# whose numbered arguments are read as linkage words.
LINKAGE_TEMPLATES = frozenset({"파생어 상자", "합성어 상자"})

12 

13 

def extract_linkage_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    node: TemplateNode,
    l_type: str,
) -> bool:
    """Extract linkage words from a box template or a ``colN`` template.

    For the templates named in ``LINKAGE_TEMPLATES`` the positional
    arguments 1..40 are read in order, stopping at the first missing one;
    every non-empty argument becomes a ``Linkage`` appended to the
    ``l_type`` list of ``word_entry``.  Templates whose name matches
    ``col<digit>`` are delegated to ``extract_col_template``.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry whose ``l_type`` attribute (a list) receives the
            new linkages; its last sense supplies the gloss.
        node: The template node to inspect.
        l_type: Name of the list attribute on ``word_entry`` to append to.

    Returns:
        True if at least one linkage was appended by the box-template
        branch, otherwise False.
    """
    # https://ko.wiktionary.org/wiki/틀:파생어_상자
    # https://ko.wiktionary.org/wiki/틀:합성어_상자
    added_data = False
    if node.template_name in LINKAGE_TEMPLATES:
        # The attached gloss is identical for every word in the box, so it
        # is resolved once.  A latest sense without any gloss yields "" rather
        # than raising IndexError.
        sense = ""
        if len(word_entry.senses) > 0 and len(word_entry.senses[-1].glosses) > 0:
            sense = word_entry.senses[-1].glosses[-1]

        for key in range(1, 41):
            if key not in node.template_parameters:
                break
            word = clean_node(wxr, None, node.template_parameters[key])
            if word != "":
                getattr(word_entry, l_type).append(
                    Linkage(word=word, sense=sense)
                )
                added_data = True
    elif re.fullmatch(r"col\d", node.template_name):
        # NOTE(review): this branch never sets ``added_data``, so the return
        # value stays False even when linkages were added — confirm that
        # callers do not rely on the flag for col templates.
        extract_col_template(wxr, word_entry, node, l_type)

    return added_data

42 

43 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Extract every linkage found in one linkage section.

    A ``"proverbs"`` section is handed to ``extract_proverb_section``.  For
    any other type, each list item found recursively under ``level_node`` is
    processed as a linkage list item, and each direct child template is
    passed to ``extract_linkage_template``; a ``외국어`` template is
    additionally passed to ``extract_translation_template``.
    """
    if linkage_type == "proverbs":
        extract_proverb_section(wxr, word_entry, level_node)
        return

    # Function-scope import, kept local as in the original code.
    from .translation import extract_translation_template

    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        extract_linkage_list_item(
            wxr, word_entry, item, linkage_type, in_linkage_section=True
        )

    for template in level_node.find_child(NodeKind.TEMPLATE):
        extract_linkage_template(wxr, word_entry, template, linkage_type)
        if template.template_name == "외국어":
            extract_translation_template(wxr, word_entry, template)

64 

65 

def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    in_linkage_section: bool,
) -> None:
    """Extract linkages from a single list item.

    The plain-text children are scanned first:

    * text containing ``:`` may override ``linkage_type`` when the part
      before the colon is a key of ``LINKAGE_SECTIONS``;
    * otherwise the first parenthesised group is remembered as a raw tag.

    Each link node in the item then becomes one ``Linkage``.  The remembered
    parenthesised text is stored as ``roman`` when it contains a lowercase
    ASCII letter, as ``sense_index`` when it is all digits, and as a raw tag
    otherwise.  If the item has no link node at all, its whole cleaned text
    is used as a single linkage word.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry that receives the linkages.
        list_item: The list item node to read.
        linkage_type: Default name of the list attribute to append to.
        in_linkage_section: When True no gloss is attached to the linkage;
            when False the last gloss of the last sense is attached.
    """
    raw_tag = ""
    is_roman = False
    for child in list_item.children:
        if not isinstance(child, str):
            continue
        if ":" in child:
            l_type_str = child[: child.index(":")].strip()
            if l_type_str in LINKAGE_SECTIONS:
                linkage_type = LINKAGE_SECTIONS[l_type_str]
        else:
            m = re.search(r"\(([^()]+)\)", child)
            if m is not None:
                raw_tag = m.group(1).strip()
                is_roman = re.search(r"[a-z]", raw_tag) is not None

    # The gloss is the same for every linkage of this item, so resolve it
    # once.  A latest sense without any gloss yields "" instead of raising
    # IndexError.
    sense = ""
    if (
        not in_linkage_section
        and len(word_entry.senses) > 0
        and len(word_entry.senses[-1].glosses) > 0
    ):
        sense = word_entry.senses[-1].glosses[-1]

    for link_node in list_item.find_child(NodeKind.LINK):
        word = clean_node(wxr, None, link_node)
        if word != "":
            linkage = Linkage(word=word, sense=sense)
            if len(raw_tag) > 0:
                if is_roman:
                    linkage.roman = raw_tag
                elif re.fullmatch(r"\d+", raw_tag) is not None:
                    linkage.sense_index = raw_tag
                else:
                    linkage.raw_tags.append(raw_tag)
                    translate_raw_tags(linkage)
            getattr(word_entry, linkage_type).append(linkage)

    # No link node at all: fall back to the cleaned text of the whole item.
    if not list_item.contain_node(NodeKind.LINK):
        word = clean_node(wxr, None, list_item.children)
        if word != "":
            linkage = Linkage(word=word, sense=sense)
            translate_raw_tags(linkage)
            getattr(word_entry, linkage_type).append(linkage)

117 

118 

def extract_proverb_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract proverbs from the list items of a proverb section.

    In each list item the first text child containing ``:`` splits the item:
    everything before the colon is the proverb, everything after it is the
    explanation.  Items that yield no proverb text this way are searched for
    ``l`` / ``연결`` templates, which are handled by ``extract_l_template``.
    """
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        proverb = Linkage(word="")
        for pos, part in enumerate(item.children):
            if not (isinstance(part, str) and ":" in part):
                continue
            # Split at the first colon only, like ``part.index(":")`` would.
            before, _, after = part.partition(":")
            proverb.word = (
                clean_node(wxr, None, item.children[:pos]) + before.strip()
            )
            proverb.sense = after.strip() + clean_node(
                wxr, None, item.children[pos + 1 :]
            )
            break

        if proverb.word == "":
            for template in item.find_child(NodeKind.TEMPLATE):
                if template.template_name in ["l", "연결"]:
                    extract_l_template(wxr, word_entry, template, "proverbs")
        else:
            word_entry.proverbs.append(proverb)

141 

142 

def extract_l_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract one linkage from an ``l`` / ``연결`` link template.

    The word is taken from positional argument 3 when it is present and
    non-empty, otherwise from argument 2.  The gloss is taken from the ``t``
    argument, or from positional argument 4 when ``t`` is absent.  Nothing is
    appended when no non-empty word argument exists.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry whose ``linkage_type`` list receives the linkage.
        t_node: The link template node.
        linkage_type: Name of the list attribute on ``word_entry``.
    """
    # https://ko.wiktionary.org/wiki/틀:연결
    # https://en.wiktionary.org/wiki/Template:link
    for word_arg in [3, 2]:
        if word_arg not in t_node.template_parameters:
            continue
        word = clean_node(wxr, None, t_node.template_parameters[word_arg])
        if word == "":
            # An empty argument 3 must not hide argument 2: keep looking
            # instead of giving up on the template.
            continue
        linkage = Linkage(word=word)
        for sense_arg in ["t", 4]:
            if sense_arg in t_node.template_parameters:
                linkage.sense = clean_node(
                    wxr, None, t_node.template_parameters[sense_arg]
                )
                break
        getattr(word_entry, linkage_type).append(linkage)
        break

165 

166 

def extract_col_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    l_type: str,
):
    """Extract linkages from an expanded ``colN`` template.

    The template is expanded and every ``<li>`` element in the result is
    read.  Within one ``<li>``, the direct ``<span>`` children are
    classified as follows:

    * ``lang`` ending in ``-Latn``: romanization for the item;
    * ``class`` containing ``qualifier-content``: comma-separated raw tags;
    * any other non-empty ``lang``: a linkage word, tagged as Traditional or
      Simplified Chinese when its class is exactly ``Hant`` or ``Hans``.

    The romanization and raw tags are applied to every word of the same
    ``<li>``, and all words are appended to the ``l_type`` list of
    ``word_entry`` at the end.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    collected = []
    for li_tag in root.find_html_recursively("li"):
        entries = []
        romanization = ""
        qualifiers = []
        for span in li_tag.find_html("span"):
            lang = span.attrs.get("lang", "")
            css_class = span.attrs.get("class", "")
            if lang.endswith("-Latn"):
                romanization = clean_node(wxr, None, span)
            elif "qualifier-content" in css_class:
                text = clean_node(wxr, None, span)
                qualifiers.extend(
                    tag for tag in map(str.strip, text.split(",")) if tag != ""
                )
            elif lang != "":
                entry = Linkage(word=clean_node(wxr, None, span))
                if css_class == "Hant":
                    entry.tags.append("Traditional-Chinese")
                elif css_class == "Hans":
                    entry.tags.append("Simplified-Chinese")
                if entry.word != "":
                    entries.append(entry)

        # Romanization and qualifiers may follow the word spans, so they are
        # attached only after the whole <li> has been read.
        for entry in entries:
            entry.raw_tags.extend(qualifiers)
            entry.roman = romanization
            translate_raw_tags(entry)
        collected.extend(entries)

    getattr(word_entry, l_type).extend(collected)