Coverage for src/wiktextract/extractor/ko/pos.py: 93%

97 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .example import extract_example_list_item 

8from .linkage import ( 

9 LINKAGE_TEMPLATES, 

10 extract_linkage_list_item, 

11 extract_linkage_template, 

12) 

13from .models import AltForm, Sense, WordEntry 

14from .section_titles import LINKAGE_SECTIONS, POS_DATA 

15from .sound import SOUND_TEMPLATES, extract_sound_template 

16from .tags import translate_raw_tags 

17from .translation import extract_translation_template 

18 

19 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Extract one part-of-speech section into a new entry in ``page_data``.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled from
    the templates and lists that are direct children of ``level_node``.  The
    new entry is removed again if it ends up without any sense.

    Args:
        wxr: Extraction context handed through to the helper extractors.
        page_data: Entries collected so far for the page; mutated in place.
        base_data: Template entry that is copied for this section.
        level_node: Section node whose children are processed.
        pos_title: Section title; a leading "보조 " is ignored for the
            ``POS_DATA`` lookup and adds the "auxiliary" tag on a match.
    """
    aux_prefix = "보조 "
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)

    lookup_title = pos_title.removeprefix(aux_prefix).strip()
    if lookup_title in POS_DATA:
        # The entry keeps the title exactly as written in the section header.
        entry.pos_title = pos_title
        pos_info = POS_DATA[lookup_title]
        entry.pos = pos_info["pos"]
        entry.tags.extend(pos_info.get("tags", []))
        if pos_title.startswith(aux_prefix) and "auxiliary" not in entry.tags:
            entry.tags.append("auxiliary")

    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in SOUND_TEMPLATES:
                extract_sound_template(wxr, entry, child)
            elif name in LINKAGE_TEMPLATES:
                extract_linkage_template(wxr, entry, child)
            elif name == "외국어":
                # Translations are associated with the most recent gloss,
                # or with an empty string when no sense exists yet.
                if entry.senses:
                    latest_gloss = entry.senses[-1].glosses[-1]
                else:
                    latest_gloss = ""
                extract_translation_template(wxr, entry, child, latest_gloss)
            continue

        if child.kind != NodeKind.LIST:
            continue
        # Lists whose marker starts with "#" hold glosses; every other list
        # goes through the unordered-item handler.
        if child.sarg.startswith("#"):
            handle_item = extract_gloss_list_item
        else:
            handle_item = extract_unorderd_list_item
        for item in child.find_child(NodeKind.LIST_ITEM):
            handle_item(wxr, entry, item)

    if not entry.senses:
        page_data.pop()

65 

66 

def extract_gloss_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Build a ``Sense`` from one gloss list item and store it on the entry.

    Children of ``list_item`` are handled as follows:

    * a nested list finalises the gloss text gathered so far, stores the
      sense, and passes each nested item to ``extract_unorderd_list_item``;
    * a template whose name ends with " of" is passed to
      ``extract_form_of_template`` and also kept as part of the gloss;
    * a "라벨" template contributes raw tags: its cleaned text is stripped of
      surrounding parentheses and split on commas;
    * anything else is gloss content.

    The sense is appended to ``word_entry.senses`` at most once, and only if
    some non-empty gloss text was found.  ``translate_raw_tags`` is called
    once for every stored sense, including senses that were stored because a
    nested list followed the gloss text.

    Args:
        wxr: Extraction context passed to ``clean_node``.
        word_entry: Entry that receives the new sense.
        list_item: The list item node to read.
    """
    gloss_nodes = []
    sense = Sense()
    # Tracks whether ``sense`` is already in ``word_entry.senses`` so that
    # gloss text on both sides of a nested list cannot add it twice.
    sense_added = False
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            gloss_text = clean_node(wxr, sense, gloss_nodes)
            if len(gloss_text) > 0:
                sense.glosses.append(gloss_text)
                if not sense_added:
                    # Store the sense before the nested items are read:
                    # their handler works on ``word_entry.senses[-1]``.
                    word_entry.senses.append(sense)
                    sense_added = True
                gloss_nodes.clear()
            for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_unorderd_list_item(wxr, word_entry, nested_list_item)
        elif isinstance(node, TemplateNode) and node.template_name.endswith(
            " of"
        ):
            extract_form_of_template(wxr, sense, node)
            gloss_nodes.append(node)
        elif isinstance(node, TemplateNode) and node.template_name == "라벨":
            label_text = clean_node(wxr, sense, node)
            sense.raw_tags.extend(
                raw_tag.strip()
                for raw_tag in label_text.strip("()").split(",")
            )
        else:
            gloss_nodes.append(node)

    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)
        if not sense_added:
            word_entry.senses.append(sense)
            sense_added = True
    if sense_added:
        # Done once, after all labels have been collected, so senses stored
        # through the nested-list path get their tags translated as well.
        translate_raw_tags(sense)

104 

105 

def extract_unorderd_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Classify one non-gloss list item and extract its data.

    The children of ``list_item`` are scanned in order; the first one that
    matches a rule below decides how the whole item is handled:

    * the first bold node, if its text looks like a sense number ("1.",
      "2-1", ...): the rest of the item is treated as a gloss;
    * a string containing "어원:": the text after the first colon plus the
      remaining children is stored in ``word_entry.etymology_texts``;
    * a string containing "참고:" or "참조:" while the entry has a sense:
      the text after the first colon plus the remaining children becomes
      the ``note`` of the last sense;
    * a string whose text before the first colon is a key of
      ``LINKAGE_SECTIONS``: the item is passed to
      ``extract_linkage_list_item``.

    If no child matches, the item is passed to ``extract_example_list_item``
    for the last sense, provided the entry has one.

    Args:
        wxr: Extraction context passed to the helper functions.
        word_entry: Entry being filled.
        list_item: The list item node to read.
    """
    is_first_bold = True
    for index, node in enumerate(list_item.children):
        following = list_item.children[index + 1 :]
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
            and is_first_bold
        ):
            # `* '''1.''' gloss text`, terrible obsolete layout
            is_first_bold = False
            bold_text = clean_node(wxr, None, node)
            if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
                new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
                new_list_item.children = following
                extract_gloss_list_item(wxr, word_entry, new_list_item)
                break
        elif isinstance(node, str) and "어원:" in node:
            etymology_nodes = [node[node.index(":") + 1 :], *following]
            e_text = clean_node(wxr, None, etymology_nodes)
            if len(e_text) > 0:
                word_entry.etymology_texts.append(e_text)
            break
        elif (
            isinstance(node, str)
            and ("참고:" in node or "참조:" in node)
            and len(word_entry.senses) > 0
        ):
            sense = word_entry.senses[-1]
            # Clean the text after the colon together with the remaining
            # children.  Stripping the text fragment on its own before
            # concatenation would drop the whitespace separating it from a
            # following link or template.
            note_nodes = [node[node.index(":") + 1 :], *following]
            sense.note = clean_node(wxr, sense, note_nodes).strip()
            break
        elif (
            isinstance(node, str)
            and ":" in node
            and node[: node.index(":")].strip() in LINKAGE_SECTIONS
        ):
            extract_linkage_list_item(wxr, word_entry, list_item, "")
            break
    else:
        # No rule matched any child: treat the item as an example.
        if len(word_entry.senses) > 0:
            extract_example_list_item(
                wxr, word_entry.senses[-1], list_item, word_entry.lang_code
            )

155 

156 

def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the word referenced by a "... of" template on ``sense``.

    Adds the "form-of" tag if it is not already present.  The referenced
    word is read from template parameter 1 for "ko-hanja form of" and from
    parameter 2 for every other template; when its cleaned text is
    non-empty it is appended to ``sense.form_of`` as an ``AltForm``.

    Args:
        wxr: Extraction context passed to ``clean_node``.
        sense: Sense that receives the tag and the form-of word.
        t_node: The template node to read.
    """
    if "form-of" not in sense.tags:
        sense.tags.append("form-of")

    if t_node.template_name == "ko-hanja form of":
        arg_key = 1
    else:
        arg_key = 2
    raw_value = t_node.template_parameters.get(arg_key, "")
    target_word = clean_node(wxr, None, raw_value)
    if target_word:
        sense.form_of.append(AltForm(word=target_word))