Coverage for src/wiktextract/extractor/cs/page.py: 85%

1import re

2from typing import Any

4from mediawiki_langcodes import name_to_code

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind

7from ...page import clean_node

8from ...wxr_context import WiktextractContext

9from .declension import extract_declension_section

10from .etymology import extract_etymology_section

11from .linkage import extract_alt_form_section, extract_linkage_section

12from .models import Sense, WordEntry

13from .pos import (

14 extract_note_section,

15 extract_pos_section,

16 extract_sense_section,

17)

18from .section_titles import LINKAGE_SECTIONS, POS_DATA

19from .sound import (

20 extract_homophone_section,

21 extract_hyphenation_section,

22 extract_sound_section,

23 extract_transcript_section,

24)

25from .translation import extract_translation_section

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
):
    """Route one heading node to the extractor that matches its title.

    The heading text is normalised first: ``(N)`` numbering is removed and
    only the part before the first "/" is kept.  The result is compared
    against the known section titles, the matching extractor is called,
    and then every nested heading is processed recursively.  Finally each
    direct link child of the heading is passed through ``clean_node``.

    Args:
        wxr: Extraction context, used for node cleaning and debug output.
        page_data: Entries collected so far; most sections are attached to
            the last element.
        base_data: Language-level entry.  Used as the fallback target when
            ``page_data`` is empty, and always for the pronunciation,
            hyphenation and homophone sections.
        level_node: The heading node to process.
    """

    def latest_entry() -> WordEntry:
        # Most recently collected entry, or the language-level data when
        # nothing has been collected yet.
        return page_data[-1] if page_data else base_data

    subtitle = re.sub(r"\(\d+\)", "", clean_node(wxr, None, level_node.largs))
    # Keep only the text before the first "/"; when there is no slash this
    # is simply the whole heading with surrounding whitespace removed.
    subtitle = subtitle.partition("/")[0].strip()

    if subtitle in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, subtitle)
    elif subtitle == "význam" and page_data:
        extract_sense_section(wxr, page_data[-1], level_node)
    elif subtitle == "výslovnost":
        extract_sound_section(wxr, base_data, level_node)
    elif subtitle == "dělení":
        extract_hyphenation_section(wxr, base_data, level_node)
    elif subtitle == "etymologie":
        # A level-3 etymology heading always goes to the language-level
        # data; deeper ones go to the last entry when there is one.
        if level_node.kind != NodeKind.LEVEL3 and page_data:
            etymology_target = page_data[-1]
        else:
            etymology_target = base_data
        extract_etymology_section(wxr, etymology_target, level_node)
    elif subtitle in ("varianty", "varianta zápisu", "varianty zápisu"):
        # Only a level-4 heading whose last entry is in the same language
        # as base_data is attached to that entry.
        attach_to_last = (
            level_node.kind == NodeKind.LEVEL4
            and len(page_data) > 0
            and base_data.lang == page_data[-1].lang
        )
        extract_alt_form_section(
            wxr,
            page_data[-1] if attach_to_last else base_data,
            level_node,
        )
    elif subtitle == "překlady":
        extract_translation_section(wxr, latest_entry(), level_node)
    elif subtitle in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, latest_entry(), level_node, LINKAGE_SECTIONS[subtitle]
        )
    elif subtitle in ("stupňování", "časování") or subtitle.startswith(
        "skloňování"
    ):
        extract_declension_section(
            wxr, latest_entry(), level_node, subtitle
        )
    elif subtitle == "homofony":
        extract_homophone_section(wxr, base_data, level_node)
    elif subtitle == "přepis":
        extract_transcript_section(wxr, latest_entry(), level_node)
    elif subtitle == "poznámka k užití":
        extract_note_section(wxr, latest_entry(), level_node)
    elif subtitle not in ("externí odkazy", "poznámky", "reference"):
        wxr.wtp.debug(f"Unknown title: {subtitle}", sortid="cs/page/27")

    # Descend into nested headings after the current one has been handled.
    for child_section in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_section)

    # Run direct link children through clean_node against the current entry
    # (presumably so link side effects such as categories are recorded on
    # it -- confirm against clean_node).
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, latest_entry(), link_node)

104

105

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its entries as plain dictionaries.

    Each level-2 heading is treated as a language section.  Headings
    titled "poznámky" or "externí odkazy" are skipped, as are languages
    whose code is not in ``wxr.config.capture_language_codes`` when that
    setting is not ``None``.  Every heading nested under a language section
    is handed to ``parse_section``.

    Args:
        wxr: Extraction context providing the parser and configuration.
        page_title: Title of the page, passed to ``start_page``.
        page_text: Wikitext of the page.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dictionary per collected
        entry.  Entries without senses get a single ``no-gloss`` sense.
    """
    # page layout
    # https://cs.wiktionary.org/wiki/Wikislovník:Formát_hesla
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_node.largs) or "unknown"
        if lang_name in ("poznámky", "externí odkazy"):
            # Not a language section.
            continue

        lang_code = name_to_code(lang_name, "cs") or "unknown"
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        # Language-level template data shared by the sections below.
        lang_base = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, lang_base, section_node)

    # Make sure every entry carries at least one sense.
    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in entries]

Coverage for src/wiktextract/extractor/cs/page.py: 85%

68 statements