Coverage for src/wiktextract/extractor/tr/page.py: 93%

60 statements  

coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

import string
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section
from .inflection import extract_inflection_section
from .linkage import extract_linkage_section
from .models import Sense, WordEntry
from .pos import extract_note_section, extract_pos_section
from .section_titles import LINKAGE_SECTIONS, LINKAGE_TAGS, POS_DATA
from .sound import extract_sound_section
from .translation import extract_translation_section


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    title_text = clean_node(wxr, None, level_node.largs)
    wxr.wtp.start_subsection(title_text)
    # Strip any trailing numbering from repeated section headings.
    title_text = title_text.rstrip(string.digits + string.whitespace)
    if title_text in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        if len(page_data[-1].senses) == 0 and title_text in LINKAGE_SECTIONS:
            # No glosses were found, so drop the empty entry and treat the
            # heading as a linkage section attached to the previous entry.
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
                LINKAGE_SECTIONS[title_text],
                LINKAGE_TAGS.get(title_text, []),
            )
    elif title_text == "Köken":  # etymology
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif title_text in ["Söyleniş", "Heceleme", "Söyleyiş"]:  # pronunciation / hyphenation
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_sound_section(
            wxr,
            page_data[-1]
            if len(page_data) > 0
            and page_data[-1].lang_code == base_data.lang_code
            and not level_node.contain_node(LEVEL_KIND_FLAGS)
            else base_data,
            level_node,
        )
    elif title_text == "Çeviriler":  # translations
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            LINKAGE_SECTIONS[title_text],
            LINKAGE_TAGS.get(title_text, []),
        )
    elif title_text == "Açıklamalar":  # notes; coverage: never true in the measured run
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text == "Çekimleme":  # inflection; coverage: always true when reached
        extract_inflection_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text not in [
        "Kaynakça",  # references
        "Ek okumalar",  # further reading
        "Kaynaklar",  # sources
        "Dış Bağlantılar",  # external links
    ]:
        wxr.wtp.debug(
            f"Unknown section: {title_text}",
            sortid="extractor/tr/page/parse_section/70",
        )

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, link_node
        )


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    # page layout
    # https://tr.wiktionary.org/wiki/Vikisözlük:Girdilerin_biçimi
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, level2_node.largs)
        # Headings are Turkish language names; fall back to "unknown" if unrecognised.
        lang_code = name_to_code(lang_name, "tr") or "unknown"
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    # Entries that ended up without any gloss get an explicit "no-gloss" marker.
    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]
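
For reference, a minimal sketch of driving parse_page directly, loosely modeled on how the project's unit tests construct a context; the Wtp and WiktionaryConfig arguments, the page title "elma", and the "Ad" part-of-speech heading are assumptions for illustration, not taken from this file.

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.tr.page import parse_page
from wiktextract.wxr_context import WiktextractContext

# Assumed setup: a Turkish-Wiktionary Wtp plus a matching config,
# in the style of the project's test fixtures.
wxr = WiktextractContext(
    Wtp(lang_code="tr", project="wiktionary"),
    WiktionaryConfig(dump_file_lang_code="tr", capture_language_codes=None),
)

# Hypothetical page text: a level-2 Turkish language section containing
# one "Ad" (noun) part-of-speech section with a single gloss line.
page_text = "== Türkçe ==\n=== Ad ===\n# a round fruit (example gloss)"
data = parse_page(wxr, "elma", page_text)
print(data)  # a list of dicts, one per extracted word entry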