Coverage for src/wiktextract/extractor/cs/page.py: 86%

51 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .linkage import extract_alt_form_section 

10from .models import Sense, WordEntry 

11from .pos import extract_pos_section, extract_sense_section 

12from .section_titles import POS_DATA 

13from .sound import extract_hyphenation_section, extract_sound_section 

14 

15 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
):
    """Route one section node to the extractor chosen by its title.

    The title is taken from the node's ``largs`` with every ``(<digits>)``
    group removed and the result stripped. After the section itself has been
    handled, each child level node is processed recursively, and finally
    every direct LINK child is passed through ``clean_node``.

    Args:
        wxr: Extraction context; also used to emit the debug message for
            unrecognised titles.
        page_data: Entries collected so far for the page. The last one is
            the target of sense sections and of link processing.
        base_data: Language-level entry that receives sound, hyphenation,
            etymology and alternative-form data.
        level_node: The section node to process.
    """
    raw_title = clean_node(wxr, None, level_node.largs)
    title = re.sub(r"\(\d+\)", "", raw_title).strip()

    # Sections whose extractor is called as (wxr, base_data, level_node).
    base_handlers = {
        "výslovnost": extract_sound_section,
        "dělení": extract_hyphenation_section,
        "varianty": extract_alt_form_section,
    }

    if title in POS_DATA and level_node.contain_node(LEVEL_KIND_FLAGS):
        # A POS title only counts when the section has nested level nodes.
        extract_pos_section(wxr, page_data, base_data, level_node, title)
    elif title == "význam" and page_data:
        # Senses attach to the most recent entry; with no entry yet the
        # title falls through to the "unknown" report below.
        extract_sense_section(wxr, page_data[-1], level_node)
    elif title in base_handlers:
        base_handlers[title](wxr, base_data, level_node)
    elif title == "etymologie":
        # Only the children that are not level nodes form the text.
        own_children = list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
        base_data.etymology_text = clean_node(wxr, base_data, own_children)
    elif title != "externí odkazy":
        wxr.wtp.debug(f"Unknown title: {title}", sortid="cs/page/27")

    for child_section in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_section)

    # Chosen after the recursion above, which may have added entries.
    # NOTE(review): presumably this lets clean_node record category links
    # on the entry - confirm against clean_node.
    link_target = page_data[-1] if page_data else base_data
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, link_target, link_node)

48 

49 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as dictionaries.

    Every level-2 node is treated as a language section unless its title is
    one of the skipped titles or its language code is excluded by
    ``wxr.config.capture_language_codes``. A missing title or an
    unresolvable language code is replaced by ``"unknown"``.

    Args:
        wxr: Extraction context providing the parser and the configuration.
        page_title: Title of the page, passed to ``start_page``.
        page_text: Wikitext of the page.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dictionary per collected
        entry. An entry without senses first gets a single sense tagged
        ``"no-gloss"``.
    """
    # page layout
    # https://cs.wiktionary.org/wiki/Wikislovník:Formát_hesla
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    entries = []
    skipped_titles = ("poznámky", "externí odkazy")

    for lang_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_node.largs) or "unknown"
        if lang_name in skipped_titles:
            continue

        lang_code = name_to_code(lang_name, "cs") or "unknown"
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        lang_entry = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, lang_entry, section_node)

    dumped = []
    for entry in entries:
        if not entry.senses:
            # Give sense-less entries a placeholder sense.
            entry.senses.append(Sense(tags=["no-gloss"]))
        dumped.append(entry.model_dump(exclude_defaults=True))
    return dumped