Coverage for src/wiktextract/extractor/ja/page.py: 83%

67 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .conjugation import extract_conjugation_section 

10from .etymology import extract_etymology_section 

11from .linkage import extract_linkage_section 

12from .models import Sense, WordEntry 

13from .pos import parse_pos_section 

14from .section_titles import LINKAGES, POS_DATA 

15from .sound import extract_sound_section 

16from .translation import extract_translation_section 

17 

18 

def _latest_entry(
    page_data: list[WordEntry], base_data: WordEntry
) -> WordEntry:
    """Return the last entry of page_data, or base_data if it is empty."""
    return page_data[-1] if page_data else base_data


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one heading node to the matching extractor, then recurse.

    The cleaned heading text is split on separator characters and each
    part is tested in order against the known section titles. After the
    title loop, every child heading of ``level_node`` is processed
    recursively, and every direct child template whose name ends in
    ``-cat`` is passed through ``clean_node``.

    Args:
        wxr: Extraction context; its ``config.capture_*`` flags gate the
            non-POS branches.
        page_data: Entries collected so far; passed on to the extractors.
        base_data: Entry used as the target when ``page_data`` is empty.
        level_node: The heading node to process.
    """
    title_texts = clean_node(wxr, None, level_node.largs)
    # Separators: ASCII colon, fullwidth colon (U+FF1A, spelled as an
    # escape so it cannot be confused with the ASCII one) and "・".
    for title_text in re.split(r":|\uff1a|・", title_texts):
        if title_text in POS_DATA:
            pre_len = len(page_data)
            parse_pos_section(wxr, page_data, base_data, level_node, title_text)
            # page_data did not grow, the title is also a linkage title
            # and an earlier entry exists: treat the section as a linkage
            # list for that earlier entry.
            if (
                len(page_data) == pre_len
                and title_text in LINKAGES
                and pre_len > 0
            ):
                extract_linkage_section(
                    wxr, page_data[-1], level_node, LINKAGES[title_text]
                )
            break
        elif title_text in ["語源", "由来"] and wxr.config.capture_etymologies:
            extract_etymology_section(wxr, page_data, base_data, level_node)
            break
        elif title_text.startswith("発音") and wxr.config.capture_pronunciation:
            extract_sound_section(wxr, page_data, base_data, level_node)
            break
        elif title_text == "翻訳" and wxr.config.capture_translations:
            extract_translation_section(
                wxr, _latest_entry(page_data, base_data), level_node
            )
            break
        elif title_text in LINKAGES and wxr.config.capture_linkages:
            extract_linkage_section(
                wxr,
                _latest_entry(page_data, base_data),
                level_node,
                LINKAGES[title_text],
            )
            break
        elif title_text == "活用" and wxr.config.capture_inflections:
            # Unlike the branches above there is no break here, so any
            # remaining title parts are still examined.
            extract_conjugation_section(
                wxr, _latest_entry(page_data, base_data), level_node
            )

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name.endswith("-cat"):
            # The target is looked up here, after the recursion above,
            # because page_data may have changed in the meantime.
            clean_node(wxr, _latest_entry(page_data, base_data), t_node)

75 

76 

def _resolve_lang_code(lang_name: str, level2_node: LevelNode) -> str:
    """Determine the language code for a level-2 heading.

    ``name_to_code`` is tried first. If it yields an empty string, the
    templates found in the heading content are scanned: an ``L`` template
    supplies its first positional parameter, and a template whose name
    consists only of lowercase ASCII letters and hyphens supplies its own
    name. Later templates override earlier ones. ``"unknown"`` is
    returned when the result is still an empty string.
    """
    code = name_to_code(lang_name, "ja")
    if code == "":
        for tpl in level2_node.find_content(NodeKind.TEMPLATE):
            if tpl.template_name == "L":
                code = tpl.template_parameters.get(1, "")
            elif re.fullmatch(r"[a-z-]+", tpl.template_name):
                code = tpl.template_name
    return "unknown" if code == "" else code


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Every level-2 heading starts a language section. A base ``WordEntry``
    is created for it, the heading's direct link children are run through
    ``clean_node`` against that base entry, and each level-3 child is
    handed to ``parse_section``. Entries that end up without any sense
    receive a single ``no-gloss`` sense before being dumped.
    """
    # page layout
    # https://ja.wiktionary.org/wiki/Wiktionary:スタイルマニュアル
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_node.largs)
        lang_code = _resolve_lang_code(lang_name, lang_node)
        wxr.wtp.start_section(lang_name)
        section_base = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for link in lang_node.find_child(NodeKind.LINK):
            clean_node(wxr, section_base, link)
        for sub_node in lang_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, entries, section_base, sub_node)

    dumped: list[dict[str, Any]] = []
    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
        dumped.append(entry.model_dump(exclude_defaults=True))
    return dumped