Coverage for src/wiktextract/extractor/ms/page.py: 74%

104 statements  

coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

  1  import string
  2  from typing import Any
  3
  4  from mediawiki_langcodes import name_to_code
  5  from wikitextprocessor.parser import (
  6      LEVEL_KIND_FLAGS,
  7      LevelNode,
  8      NodeKind,
  9      WikiNode,
 10  )
 11
 12  from ...page import clean_node
 13  from ...wxr_context import WiktextractContext
 14  from .linkage import extract_form_section, extract_linkage_section
 15  from .models import Sense, WordEntry
 16  from .pos import extract_pos_section
 17  from .section_titles import FORM_SECTIONS, LINKAGE_SECTIONS, POS_DATA
 18  from .sound import extract_sound_section
 19  from .translation import extract_translation_section
 20
 21

22def parse_section( 

23 wxr: WiktextractContext, 

24 page_data: list[WordEntry], 

25 base_data: WordEntry, 

26 level_node: LevelNode, 

27) -> None: 

28 title_text = clean_node(wxr, None, level_node.largs) 

29 wxr.wtp.start_subsection(title_text) 

30 title_text = title_text.rstrip(string.digits + string.whitespace + "IVX") 

31 lower_title = title_text.lower() 

32 if lower_title in POS_DATA: 

33 old_data_len = len(page_data) 

34 extract_pos_section(wxr, page_data, base_data, level_node, title_text) 

35 if len(page_data) == old_data_len and lower_title in LINKAGE_SECTIONS: 

36 extract_linkage_section(wxr, page_data, base_data, level_node) 

37 elif lower_title == "etimologi": 

38 extract_etymology_section(wxr, page_data, base_data, level_node) 

39 elif lower_title in FORM_SECTIONS: 

40 extract_form_section( 

41 wxr, 

42 page_data[-1] if len(page_data) > 0 else base_data, 

43 level_node, 

44 FORM_SECTIONS[lower_title], 

45 ) 

46 elif lower_title == "tesaurus" or lower_title in LINKAGE_SECTIONS: 

47 extract_linkage_section(wxr, page_data, base_data, level_node) 

48 elif lower_title == "terjemahan": 

49 extract_translation_section(wxr, page_data, base_data, level_node) 

50 elif lower_title == "sebutan": 50 ↛ 52line 50 didn't jump to line 52 because the condition on line 50 was always true

51 extract_sound_section(wxr, page_data, base_data, level_node) 

52 elif lower_title in ["nota penggunaan", "penggunaan"]: 

53 extract_note_section( 

54 wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node 

55 ) 

56 elif lower_title not in [ 

57 "pautan luar", 

58 "rujukan", 

59 "bacaan lanjut", 

60 "lihat juga", 

61 ]: 

62 wxr.wtp.debug(f"Unknown section: {title_text}", sortid="ms/page/44") 

63 

64 for next_level in level_node.find_child(LEVEL_KIND_FLAGS): 

65 parse_section(wxr, page_data, base_data, next_level) 

66 for link_node in level_node.find_child(NodeKind.LINK): 66 ↛ 67line 66 didn't jump to line 67 because the loop on line 66 never started

67 clean_node( 

68 wxr, page_data[-1] if len(page_data) > 0 else base_data, link_node 

69 ) 

70 for t_node in level_node.find_child(NodeKind.TEMPLATE): 

71 if t_node.template_name in ["topik", "C", "topics"]: 71 ↛ 72line 71 didn't jump to line 72 because the condition on line 71 was never true

72 clean_node( 

73 wxr, page_data[-1] if len(page_data) > 0 else base_data, t_node 

74 ) 

75 

76 

 77  def parse_page(
 78      wxr: WiktextractContext, page_title: str, page_text: str
 79  ) -> list[dict[str, Any]]:
 80      # Page format
 81      # https://ms.wiktionary.org/wiki/Wikikamus:Memulakan_laman_baru#Format_laman
 82      if page_title.startswith(("Portal:", "Reconstruction:")):
     [82 ↛ 83]  line 82 didn't jump to line 83 because the condition on line 82 was never true
 83          return []
 84      wxr.wtp.start_page(page_title)
 85      tree = wxr.wtp.parse(page_text, pre_expand=True)
 86      page_data: list[WordEntry] = []
 87
 88      for level2_node in tree.find_child(NodeKind.LEVEL2):
 89          pre_data_len = len(page_data)
 90          lang_name = clean_node(wxr, None, level2_node.largs)
 91          lang_code = (
 92              name_to_code(lang_name.removeprefix("Bahasa "), "ms") or "unknown"
 93          )
 94          wxr.wtp.start_section(lang_name)
 95          base_data = WordEntry(
 96              word=wxr.wtp.title,
 97              lang_code=lang_code,
 98              lang=lang_name,
 99              pos="unknown",
100          )
101          for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
102              parse_section(wxr, page_data, base_data, next_level_node)
103          if len(page_data) == pre_data_len:
104              page_data.append(base_data.model_copy(deep=True))
105
106      for data in page_data:
107          if len(data.senses) == 0:
108              data.senses.append(Sense(tags=["no-gloss"]))
109      return [m.model_dump(exclude_defaults=True) for m in page_data]
110
111

112  def extract_etymology_section(
113      wxr: WiktextractContext,
114      page_data: list[WordEntry],
115      base_data: WordEntry,
116      level_node: LevelNode,
117  ):
118      cats = {}
119      e_nodes = []
120      e_texts = []
121      for node in level_node.children:
122          if isinstance(node, LevelNode):
     [122 ↛ 123]  line 122 didn't jump to line 123 because the condition on line 122 was never true
123              break
124          elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
125              for list_item in node.find_child(NodeKind.LIST_ITEM):
126                  e_text = clean_node(wxr, cats, list_item.children)
127                  if e_text != "":
     [127 ↛ 125]  line 127 didn't jump to line 125 because the condition on line 127 was always true
128                      e_texts.append(e_text)
129          else:
130              e_nodes.append(node)
131      if len(e_nodes) > 0:
     [131 ↛ 135]  line 131 didn't jump to line 135 because the condition on line 131 was always true
132          e_text = clean_node(wxr, cats, e_nodes)
133          if e_text != "":
134              e_texts.append(e_text)
135      if len(e_texts) == 0:
     [135 ↛ 136]  line 135 didn't jump to line 136 because the condition on line 135 was never true
136          return
137      if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
138          base_data.etymology_texts = e_texts
139          base_data.categories.extend(cats.get("categories", []))
140      elif level_node.kind == NodeKind.LEVEL3:
     [140 ↛ 146]  line 140 didn't jump to line 146 because the condition on line 140 was always true
141          for data in page_data:
142              if data.lang_code == page_data[-1].lang_code:
     [142 ↛ 141]  line 142 didn't jump to line 141 because the condition on line 142 was always true
143                  data.etymology_texts = e_texts
144                  data.categories.extend(cats.get("categories", []))
145      else:
146          page_data[-1].etymology_texts = e_texts
147          page_data[-1].categories.extend(cats.get("categories", []))
148
149

150  def extract_note_section(
151      wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
152  ) -> None:
153      has_list = False
154      for list_node in level_node.find_child(NodeKind.LIST):
155          has_list = True
156          for list_item in list_node.find_child(NodeKind.LIST_ITEM):
157              note = clean_node(wxr, None, list_item.children)
158              if note != "":
159                  word_entry.notes.append(note)
160      if not has_list:
161          note = clean_node(wxr, None, level_node.children)
162          if note != "":
163              word_entry.notes.append(note)
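
For orientation, the module is driven per page: parse_page() splits the page into level-2 language sections, builds a base WordEntry for each, and lets parse_section() dispatch every subsection by its lower-cased title (POS titles, "etimologi", "terjemahan", "sebutan", and so on). Below is a minimal sketch of calling parse_page() directly, roughly in the style of an extractor unit test; the Wtp and WiktionaryConfig constructor arguments and the sample wikitext (including the assumption that "Kata nama" is one of the POS titles in POS_DATA) are assumptions, not facts taken from the file above.

# Hedged sketch: exercise parse_page() on a hand-written ms.wiktionary page.
# Constructor arguments are assumptions and may differ between versions.
from wikitextprocessor import Wtp
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.ms.page import parse_page
from wiktextract.wxr_context import WiktextractContext

wxr = WiktextractContext(
    Wtp(lang_code="ms"),  # assumed: temporary in-memory page database for Malay wiki
    WiktionaryConfig(),   # assumed: default config is enough for a direct call
)

page_text = """== Bahasa Melayu ==
=== Kata nama ===

# cecair jernih yang diperlukan oleh hidupan
"""

# Each returned dict is a WordEntry dumped with exclude_defaults=True;
# entries with no extracted senses carry the "no-gloss" tag.
for entry in parse_page(wxr, "air", page_text):
    print(entry)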