Coverage for src/wiktextract/extractor/ku/page.py: 74%

1import string

2from typing import Any

4from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind

6from ...page import clean_node

7from ...wxr_context import WiktextractContext

8from .descendant import extract_descendant_section

9from .etymology import extract_etymology_section

10from .example import extract_example_section

11from .linkage import extract_linkage_section

12from .models import Sense, WordEntry

13from .pos import extract_pos_section

14from .section_titles import LINKAGE_SECTIONS, LINKAGE_TAGS, POS_DATA

15from .sound import extract_sound_section

16from .translation import extract_translation_section, is_translation_page

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route a heading node to its section extractor, then recurse.

    The heading text is cleaned, stripped of trailing digits and whitespace,
    and registered as the current subsection before being matched against
    the known section titles. A title that matches nothing, other than
    "Çavkanî", is reported through ``wxr.wtp.debug``. Every child heading
    of ``level_node`` is then processed the same way.

    Args:
        wxr: Extraction context, also used for subsection tracking and
            debug messages.
        page_data: Entries collected so far for the page; may be modified
            in place.
        base_data: Entry used as the extraction target while ``page_data``
            is empty, and always used for the "Bilêvkirin" section.
        level_node: Heading node to process.
    """
    section_title = clean_node(wxr, None, level_node.largs).rstrip(
        string.digits + string.whitespace
    )
    wxr.wtp.start_subsection(section_title)

    def target_entry() -> WordEntry:
        # Latest collected entry, or the base data when nothing has been
        # collected yet. Evaluated at call time so that it reflects any
        # change made to page_data earlier in the same branch.
        return page_data[-1] if page_data else base_data

    if section_title in POS_DATA:
        extract_pos_section(
            wxr, page_data, base_data, level_node, section_title
        )
        # A title that is both a POS name and a linkage name, and whose
        # newest entry has no senses, is re-read as a linkage section.
        if not page_data[-1].senses and section_title in LINKAGE_SECTIONS:
            page_data.pop()
            extract_linkage_section(
                wxr,
                target_entry(),
                level_node,
                LINKAGE_SECTIONS[section_title],
                LINKAGE_TAGS.get(section_title, []),
            )
    elif section_title == "Etîmolojî":
        extract_etymology_section(wxr, target_entry(), level_node)
    elif section_title in ("Werger", "Bi zaravayên din"):
        dialect_tags = (
            ["dialectal"] if section_title == "Bi zaravayên din" else []
        )
        extract_translation_section(
            wxr, target_entry(), level_node, tags=dialect_tags
        )
    elif section_title in ("Bi alfabeyên din", "Herwiha", "Bide ber"):
        # These titles are passed on with an empty linkage type.
        extract_linkage_section(wxr, target_entry(), level_node, "")
    elif section_title in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            target_entry(),
            level_node,
            LINKAGE_SECTIONS[section_title],
            LINKAGE_TAGS.get(section_title, []),
        )
    elif section_title == "Bilêvkirin":
        # Sound data always goes to the base data, not the latest entry.
        extract_sound_section(wxr, base_data, level_node)
    elif section_title in ("Ji wêjeyê", "Ji wêjeya klasîk"):
        extract_example_section(wxr, target_entry(), level_node)
    elif section_title == "Bikaranîn":
        extract_note_section(wxr, target_entry(), level_node)
    elif section_title == "Dûnde":
        extract_descendant_section(wxr, target_entry(), level_node)
    elif section_title != "Çavkanî":
        wxr.wtp.debug(f"Unknown title: {section_title}")

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Pages for which ``is_translation_page`` is true yield an empty list.
    Otherwise each level-2 heading starts a language section: a base
    ``WordEntry`` is built for it and every child heading is handed to
    ``parse_section``. Entries left without senses receive a placeholder
    sense tagged "no-gloss" before serialization.

    Args:
        wxr: Extraction context.
        page_title: Title of the page being parsed.
        page_text: Wikitext of the page.

    Returns:
        One dict per collected entry, dumped with ``exclude_defaults=True``.
    """
    # page layout
    # https://ku.wiktionary.org/wiki/Wîkîferheng:Normalkirina_gotaran
    # https://ku.wiktionary.org/wiki/Alîkarî:Formata_nivîsînê
    if is_translation_page(page_title):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []
    for lang_node in root.find_child(NodeKind.LEVEL2):
        category_data = {}
        lang_name = clean_node(wxr, category_data, lang_node.largs)

        # The first positional argument of a heading template supplies the
        # language code; the last non-empty value wins.
        lang_code = "unknown"
        for heading_template in lang_node.find_content(NodeKind.TEMPLATE):
            candidate = clean_node(
                wxr, None, heading_template.template_parameters.get(1, "")
            )
            if candidate:
                lang_code = candidate

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=category_data.get("categories", []),
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, section_node)

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in entries]

122

123

def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect list items under ``level_node`` as notes on ``word_entry``.

    Each list item found in a list that is a direct child of ``level_node``
    is cleaned to text; non-empty results are appended to
    ``word_entry.notes`` in document order.
    """
    # Lazy flattening keeps the same traversal order as two nested loops.
    note_items = (
        item
        for note_list in level_node.find_child(NodeKind.LIST)
        for item in note_list.find_child(NodeKind.LIST_ITEM)
    )
    for item in note_items:
        note_text = clean_node(wxr, None, item.children)
        if note_text:
            word_entry.notes.append(note_text)