Coverage for src / wiktextract / extractor / ko / page.py: 34%

194 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-01 08:08 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import ( 

6 LEVEL_KIND_FLAGS, 

7 HTMLNode, 

8 LevelNode, 

9 NodeKind, 

10 TemplateNode, 

11 WikiNode, 

12) 

13 

14from ...page import clean_node 

15from ...wxr_context import WiktextractContext 

16from .etymology import extract_etymology_section, extract_ja_kanjitab_template 

17from .linkage import extract_linkage_section 

18from .models import Form, Linkage, Sense, WordEntry 

19from .pos import extract_grammar_note_section, extract_pos_section 

20from .section_titles import LINKAGE_SECTIONS, POS_DATA 

21from .sound import ( 

22 SOUND_TEMPLATES, 

23 extract_sound_section, 

24 extract_sound_template, 

25) 

26from .tags import translate_raw_tags 

27from .translation import extract_translation_section 

28 

29 

def extract_section_categories(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect category links and category templates found in a section.

    Categories are attached to the most recently created word entry when
    one exists, otherwise to ``base_data`` (so they are inherited by
    entries created later from it).
    """
    # Fix: removed coverage-report artifact fused into the template-name
    # condition line; hoisted the repeated target expression.
    target = page_data[-1] if len(page_data) > 0 else base_data
    for link_node in level_node.find_child(NodeKind.LINK):
        # Category links like [[분류:...]] are absorbed by clean_node.
        clean_node(wxr, target, link_node)
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in ["C", "topics"]:
            clean_node(wxr, target, t_node)

45 

46 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one wiki section to the matching extractor by its title.

    Recurses into child sections and finally harvests section-level
    categories. Fix: removed coverage-report artifacts that had been
    fused into two condition lines.
    """
    title_text = clean_node(wxr, None, level_node.largs)
    # Drop trailing numbering such as "명사 1" and wrapping parentheses.
    title_text = re.sub(r"\s*\d+$", "", title_text).strip("() ")
    if "(" in title_text:
        # Keep only the part before a parenthesized qualifier.
        title_text = title_text[: title_text.index("(")]
    # "보조 " prefix marks auxiliary POS sections ("보조 동사" etc.).
    if title_text.removeprefix("보조 ").strip() in POS_DATA:
        orig_page_data_len = len(page_data)
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        if (
            len(page_data) == orig_page_data_len
            and title_text in LINKAGE_SECTIONS
            and len(page_data) > 0
        ):  # try extract as linkage section
            extract_linkage_section(
                wxr, page_data[-1], level_node, LINKAGE_SECTIONS[title_text]
            )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text == "번역":  # translations
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "발음":  # pronunciation
        extract_sound_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "어원":  # etymology
        extract_etymology_section(
            wxr,
            # Reuse the last entry only if it has no etymology yet,
            # otherwise stash the text on base_data for later entries.
            page_data[-1]
            if len(page_data) > 0 and len(page_data[-1].etymology_texts) == 0
            else base_data,
            level_node,
        )
    elif title_text == "어법 주의 사항":  # usage/grammar notes
        extract_grammar_note_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
        )
    elif title_text in ["다른 표기", "표기"]:  # alternative forms
        extract_alt_form_section(wxr, base_data, level_node)
    elif title_text in [
        "참고 문헌",
        "독음",
        "자원",
        "교차언어",
        "관사를 입력하세요",
        "각주",
        "갤러리",
        "참조",
        "이체자",
        "외부 링크",
    ]:
        pass  # ignore
    else:
        wxr.wtp.debug(f"unknown title: {title_text}", sortid="ko/page/63")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    extract_section_categories(wxr, page_data, base_data, level_node)

119 

120 

def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
    """Process one level-2 (language) section of a page.

    Builds a ``base_data`` template entry for the language, handles
    language-level templates (sounds, redirects, zh/ja form tables),
    then parses all child sections. Fix: removed coverage-report
    artifacts fused into several condition lines.
    """
    pre_data_len = len(page_data)
    lang_name = clean_node(wxr, None, level2_node.largs)
    if lang_name == "":
        lang_name = "unknown"
    lang_code = name_to_code(lang_name, "ko")
    if lang_code == "":
        lang_code = "unknown"
    if (
        wxr.config.capture_language_codes is not None
        and lang_code not in wxr.config.capture_language_codes
    ):
        return
    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    extract_section_categories(wxr, page_data, base_data, level2_node)
    for t_node in level2_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in SOUND_TEMPLATES:
            extract_sound_template(wxr, base_data, t_node)
        elif t_node.template_name == "zh-see":
            # Soft redirect to another Chinese character/spelling.
            base_data.redirects.append(
                clean_node(wxr, None, t_node.template_parameters.get(1, ""))
            )
            clean_node(wxr, base_data, t_node)
        elif t_node.template_name in ["ja-see", "ja-see-kango"]:
            extract_ja_see_template(wxr, base_data, t_node)
        elif t_node.template_name == "zh-forms":
            extract_zh_forms(wxr, base_data, t_node)
        elif (
            t_node.template_name.endswith("-kanjitab")
            or t_node.template_name == "ja-kt"
        ):
            extract_ja_kanjitab_template(wxr, t_node, base_data)
    if len(base_data.redirects) > 0:
        # Redirect-only entries are kept even without a POS section.
        page_data.append(base_data)
    for next_level in level2_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    # no POS section
    if len(page_data) == pre_data_len:
        extract_pos_section(wxr, page_data, base_data, level2_node, "")

169 

170 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Korean Wiktionary page into word-entry dicts.

    Returns a list of ``WordEntry`` model dumps (defaults excluded).
    Fix: removed a coverage-report artifact fused into the title check.
    """
    # page layout
    # https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    # https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    # NOTE(review): "T195546/NS111" looks like a Phabricator test-page
    # title exclusion — confirm it is still needed.
    if page_title.startswith(("Appendix:", "T195546/NS111")):
        return []
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, page_data, level2_node)

    for data in page_data:
        if len(data.senses) == 0:
            # Keep entries without glosses but mark them explicitly.
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]

189 

190 

def extract_alt_form_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Pull alternative spellings from "alt"/"alter" templates in a section."""
    for template in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if template.template_name not in ["alt", "alter"]:
            continue
        extract_alt_template(wxr, base_data, template)

197 

198 

def extract_alt_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand an "alt"/"alter" template and add its forms to ``base_data``.

    Spans matching the template's language argument become forms, a
    trailing "-Latn" span supplies the romanization of the previous
    form, and label spans become raw tags on every collected form.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    target_lang = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    collected: list[Form] = []
    for span in expanded.find_html("span"):
        lang_attr = span.attrs.get("lang", "")
        classes = span.attrs.get("class", "").split()
        if lang_attr == target_lang:
            text = clean_node(wxr, None, span)
            if text != "":
                collected.append(Form(form=text))
        elif lang_attr.endswith("-Latn") and len(collected) > 0:
            collected[-1].roman = clean_node(wxr, None, span)
        elif "label-content" in classes and len(collected) > 0:
            label = clean_node(wxr, None, span)
            if label != "":
                for entry in collected:
                    entry.raw_tags.append(label)
                    translate_raw_tags(entry)
    base_data.forms.extend(collected)

223 

224 

def extract_ja_see_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Record redirect targets from a "ja-see"/"ja-see-kango" template."""
    # Positional (integer-keyed) parameters are the redirect targets.
    base_data.redirects.extend(
        clean_node(wxr, None, value)
        for key, value in t_node.template_parameters.items()
        if isinstance(key, int)
    )
    # Also absorb any categories the template emits.
    clean_node(wxr, base_data, t_node)

232 

233 

def extract_zh_forms(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand a "zh-forms" template and walk its table of Chinese forms.

    Header cells establish the row label/tags; data cells in the same
    row are parsed unless the header cell already contained the forms.
    """
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in expanded.find_child(NodeKind.TABLE):
        for table_row in table_node.find_child(NodeKind.TABLE_ROW):
            header_text = ""
            header_tags: list[str] = []
            forms_in_header = False
            for row_cell in table_row.find_child(cell_kinds):
                if row_cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text, header_tags, forms_in_header = (
                        extract_zh_forms_header_cell(wxr, base_data, row_cell)
                    )
                elif not forms_in_header:
                    extract_zh_forms_data_cell(
                        wxr, base_data, row_cell, header_text, header_tags
                    )

259 

260 

def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Parse a zh-forms header cell.

    Returns ``(header_text, header_tags, has_span)`` where
    ``header_text`` is the label before the first <span>, ``header_tags``
    are its "/" or " 및 "-separated parts, and ``has_span`` tells the
    caller the forms were already extracted here from the spans.
    """
    has_span = False
    span_start = len(header_cell.children)
    # Locate the earliest <span>; everything before it is the label.
    for idx, _span in header_cell.find_html("span", with_index=True):
        has_span = True
        if idx < span_start:
            span_start = idx
    header_text = clean_node(wxr, None, header_cell.children[:span_start])
    header_tags = [
        part.strip()
        for part in re.split(r"/| 및 ", header_text)
        if part.strip() != ""
    ]
    for span_node in header_cell.find_html_recursively("span"):
        lang_attr = span_node.attrs.get("lang", "")
        word_nodes = []
        superscript_title = ""
        for child in span_node.children:
            if isinstance(child, HTMLNode) and child.tag == "sup":
                # <sup><span title="..."> carries an extra annotation.
                for inner_span in child.find_html("span"):
                    superscript_title = inner_span.attrs.get("title", "")
            else:
                word_nodes.append(child)
        if lang_attr not in ["zh-Hant", "zh-Hans"]:
            continue
        for word in clean_node(wxr, None, word_nodes).split("/"):
            if word in [base_data.word, ""]:
                continue
            new_form = Form(form=word, raw_tags=header_tags)
            if superscript_title != "":
                new_form.raw_tags.append(superscript_title)
            translate_raw_tags(new_form)
            base_data.forms.append(new_form)
    return header_text, header_tags, has_span

296 

297 

def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
    """Parse one zh-forms data cell into forms (or anagram linkages).

    ``row_header``/``row_header_tags`` come from the row's header cell.
    Recurses into "white-space:nowrap;" wrapper spans; small-font spans
    annotate the forms collected so far in this cell.
    """
    forms = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")
        if span_style == "white-space:nowrap;":
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
        elif "font-size:80%" in span_style:
            raw_tag = clean_node(wxr, None, top_span_tag)
            if raw_tag != "":
                for form in forms:
                    form.raw_tags.append(raw_tag)
                    translate_raw_tags(form)
        elif span_lang in ["zh-Hant", "zh-Hans", "zh"]:
            word = clean_node(wxr, None, top_span_tag)
            if word not in ["", "/", base_data.word]:
                form = Form(form=word)
                # NOTE(review): this English literal never matches the
                # Korean headers seen here ("어구전철") — confirm intent.
                if row_header != "anagram":
                    # Fix: copy instead of assigning the caller's list —
                    # direct assignment aliased one list across every
                    # form, so the append in the small-font branch above
                    # mutated row_header_tags and sibling forms.
                    form.raw_tags = list(row_header_tags)
                if span_lang == "zh-Hant":
                    form.tags.append("Traditional-Chinese")
                elif span_lang == "zh-Hans":
                    form.tags.append("Simplified-Chinese")
                translate_raw_tags(form)
                forms.append(form)

    if row_header == "어구전철":  # anagram row
        for form in forms:
            base_data.anagrams.append(
                Linkage(word=form.form, raw_tags=form.raw_tags, tags=form.tags)
            )
    else:
        base_data.forms.extend(forms)