Coverage report for src/wiktextract/extractor/ko/page.py: 88% of 68 statements covered (generated by coverage.py v7.6.4 at 2024-10-25 10:11 +0000).

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .etymology import extract_etymology_section 

10from .linkage import extract_linkage_section 

11from .models import Sense, WordEntry 

12from .pos import extract_pos_section 

13from .section_titles import LINKAGE_SECTIONS, POS_DATA 

14from .sound import ( 

15 SOUND_TEMPLATES, 

16 extract_sound_section, 

17 extract_sound_template, 

18) 

19from .translation import extract_translation_section 

20 

21 

def extract_section_categories(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect category links found directly under a section node.

    Each link child is passed through ``clean_node``, which records any
    category it carries onto the target entry. The target is the most
    recently added entry in ``page_data`` when one exists, otherwise
    ``base_data``.
    """
    target = page_data[-1] if page_data else base_data
    for link in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, target, link)

32 

33 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch a section to the matching extractor based on its title.

    Trailing digits in the title (e.g. a numbered POS heading) are stripped
    before matching. Unrecognized titles are reported through
    ``wxr.wtp.debug``. After the section itself is handled, child sections
    are parsed recursively and category links at this level are collected.
    """

    def current_entry() -> WordEntry:
        # Latest POS entry, or the language-level base data when no POS
        # entry has been created yet.
        return page_data[-1] if len(page_data) > 0 else base_data

    title = clean_node(wxr, None, level_node.largs)
    title = re.sub(r"\s*\d+$", "", title)
    if title.removeprefix("보조 ").strip() in POS_DATA:
        entries_before = len(page_data)
        extract_pos_section(wxr, page_data, base_data, level_node, title)
        produced_nothing = len(page_data) == entries_before
        if produced_nothing and title in LINKAGE_SECTIONS and len(page_data) > 0:
            # POS extraction added no entry; retry the section as linkage.
            extract_linkage_section(
                wxr, page_data[-1], level_node, LINKAGE_SECTIONS[title]
            )
    elif title in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, current_entry(), level_node, LINKAGE_SECTIONS[title]
        )
    elif title == "번역":
        extract_translation_section(wxr, current_entry(), level_node)
    elif title == "발음":
        extract_sound_section(wxr, current_entry(), level_node)
    elif title == "어원":
        extract_etymology_section(wxr, current_entry(), level_node)
    elif title in ["참고 문헌", "독음", "자원"]:
        pass  # intentionally ignored section titles
    else:
        wxr.wtp.debug(f"unknown title: {title}", sortid="ko/page/63")

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    extract_section_categories(wxr, page_data, base_data, level_node)

81 

82 

def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
    """Process one language-level (level 2) section of a page.

    Resolves the language name and code (falling back to "unknown"), skips
    languages excluded by the capture configuration, seeds a base
    ``WordEntry``, extracts language-level categories and sound templates,
    then recurses into child sections. If no child section produced a POS
    entry, the whole level-2 node is tried as a POS section.
    """
    entries_before = len(page_data)
    lang_name = clean_node(wxr, None, level2_node.largs) or "unknown"
    lang_code = name_to_code(lang_name, "ko") or "unknown"
    wanted_codes = wxr.config.capture_language_codes
    if wanted_codes is not None and lang_code not in wanted_codes:
        return  # language filtered out by configuration
    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    extract_section_categories(wxr, page_data, base_data, level2_node)
    # Sound templates placed directly under the language heading.
    for template in level2_node.find_child(NodeKind.TEMPLATE):
        if template.template_name in SOUND_TEMPLATES:
            extract_sound_template(wxr, base_data, template)

    for child_level in level2_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    if len(page_data) == entries_before:
        # No POS subsection was found; treat the language section itself
        # as a POS section with an empty title.
        extract_pos_section(wxr, page_data, base_data, level2_node, "")

116 

117 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse a Korean Wiktionary page into word-entry dictionaries.

    Page layout references:
    https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    """
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for lang_section in root.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, page_data, lang_section)

    for entry in page_data:
        if not entry.senses:
            # Keep gloss-less entries in the output, explicitly tagged.
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]