Coverage for src/wiktextract/extractor/ku/page.py: 74%

71 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-09 14:03 +0000

1import string 

2from typing import Any 

3 

4from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .descendant import extract_descendant_section 

9from .etymology import extract_etymology_section 

10from .example import extract_example_section 

11from .linkage import extract_linkage_section 

12from .models import Sense, WordEntry 

13from .pos import extract_pos_section 

14from .section_titles import LINKAGE_SECTIONS, LINKAGE_TAGS, POS_DATA 

15from .sound import extract_sound_section 

16from .translation import extract_translation_section, is_translation_page 

17 

18 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one section heading to its extractor, then recurse.

    The heading text is cleaned and stripped of trailing digits and
    whitespace before it is matched against the known section titles.
    Every child level node of ``level_node`` is then processed with the
    same function, whichever branch (if any) matched.

    Args:
        wxr: Extraction context; also used to record the current
            subsection and to emit debug messages.
        page_data: Entries collected so far for the page; mutated in place.
        base_data: Language-level entry used as the target when
            ``page_data`` is still empty.
        level_node: The section node to process.
    """
    heading = clean_node(wxr, None, level_node.largs).rstrip(
        string.digits + string.whitespace
    )
    wxr.wtp.start_subsection(heading)

    def current_entry() -> WordEntry:
        # Most sections attach to the latest entry; before any entry
        # exists they attach to the language-level base data instead.
        return page_data[-1] if page_data else base_data

    if heading in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, heading)
        # A title that is both a POS and a linkage title, and that produced
        # no senses, is re-read as a linkage section: drop the empty entry
        # and attach the linkage data to whatever entry is now last.
        if not page_data[-1].senses and heading in LINKAGE_SECTIONS:
            page_data.pop()
            extract_linkage_section(
                wxr,
                current_entry(),
                level_node,
                LINKAGE_SECTIONS[heading],
                LINKAGE_TAGS.get(heading, []),
            )
    elif heading == "Etîmolojî":
        extract_etymology_section(wxr, current_entry(), level_node)
    elif heading in ("Werger", "Bi zaravayên din"):
        extract_translation_section(
            wxr,
            current_entry(),
            level_node,
            tags=["dialectal"] if heading == "Bi zaravayên din" else [],
        )
    elif heading in ("Bi alfabeyên din", "Herwiha", "Bide ber"):
        # These titles are passed on with an empty linkage type.
        extract_linkage_section(wxr, current_entry(), level_node, "")
    elif heading in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            current_entry(),
            level_node,
            LINKAGE_SECTIONS[heading],
            LINKAGE_TAGS.get(heading, []),
        )
    elif heading == "Bilêvkirin":
        # Sound data always goes to the language-level base entry.
        extract_sound_section(wxr, base_data, level_node)
    elif heading in ("Ji wêjeyê", "Ji wêjeya klasîk"):
        extract_example_section(wxr, current_entry(), level_node)
    elif heading == "Bikaranîn":
        extract_note_section(wxr, current_entry(), level_node)
    elif heading == "Dûnde":
        extract_descendant_section(wxr, current_entry(), level_node)
    elif heading != "Çavkanî":
        # "Çavkanî" is skipped silently; anything else is reported.
        wxr.wtp.debug(f"Unknown title: {heading}")

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

84 

85 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as dictionaries.

    Translation pages (as decided by ``is_translation_page``) yield an
    empty list. Otherwise each level-2 section becomes a language-level
    base entry, and its child sections are handed to ``parse_section``.
    Entries that end up without senses receive a placeholder sense tagged
    ``no-gloss`` before serialisation.

    Args:
        wxr: Extraction context.
        page_title: Title of the page being parsed.
        page_text: Raw wikitext of the page.

    Returns:
        One dictionary per collected entry, dumped with
        ``exclude_defaults=True``.
    """
    # page layout
    # https://ku.wiktionary.org/wiki/Wîkîferheng:Normalkirina_gotaran
    # https://ku.wiktionary.org/wiki/Alîkarî:Formata_nivîsînê
    if is_translation_page(page_title):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        collected_cats: dict[str, Any] = {}
        lang_name = clean_node(wxr, collected_cats, lang_node.largs)

        # The first positional argument of a template in the heading is
        # taken as the language code; the last non-empty one wins.
        lang_code = "unknown"
        for heading_template in lang_node.find_content(NodeKind.TEMPLATE):
            candidate = clean_node(
                wxr, None, heading_template.template_parameters.get(1, "")
            )
            if candidate:
                lang_code = candidate

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=collected_cats.get("categories", []),
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, section_node)

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in entries]

122 

123 

def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Append each non-empty list item of the section to the entry's notes.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry whose ``notes`` list is extended in place.
        level_node: Section node whose direct list children are read.
    """
    for bullet_list in level_node.find_child(NodeKind.LIST):
        for item in bullet_list.find_child(NodeKind.LIST_ITEM):
            text = clean_node(wxr, None, item.children)
            if not text:
                # Items that clean down to nothing are not recorded.
                continue
            word_entry.notes.append(text)