# wordcloud/count.py (branch: main)

import os
import json
import jieba

from tqdm import tqdm
from collections import defaultdict

from count_chi import OUTPUT_DIR as INPUT_DIR

# Echo the directory that will be scanned so a run can be checked at a glance.
print(f'INPUT_DIR = {INPUT_DIR!r}')

_SPLIT_PREFIX = 'split-'
_SPLIT_SUFFIX = '.jsonl'


def _count_words(path, tokenize, show_progress):
    """Count normalized tokens over the ``content`` field of a JSONL file.

    Args:
        path: Path of the JSONL file to read (UTF-8).
        tokenize: Callable taking a string and returning an iterable of
            string tokens.
        show_progress: When true, wrap the line iteration in ``tqdm`` using
            *path* as the progress-bar description.

    Returns:
        A ``defaultdict(int)`` mapping each lower-cased, stripped token to
        the number of times it occurred.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    counts = defaultdict(int)
    with open(path, 'r', encoding = 'utf-8') as file:
        lines = tqdm(file, path) if show_progress else file
        for line in lines:
            # Blank lines are not JSON documents; json.loads would raise.
            if not line.strip():
                continue
            obj = json.loads(line)
            content = obj.get('content') if isinstance(obj, dict) else None
            # A missing or null ``content`` cannot be tokenized; skip it
            # instead of crashing in the tokenizer.
            if not isinstance(content, str) or not content:
                continue
            for word in tokenize(content):
                key = word.lower().strip()
                # Whitespace-only tokens normalize to '' and are not words.
                if key:
                    counts[key] += 1
    return counts


def main(input_dir = None, tokenize = None, show_progress = True):
    """Write a word-frequency table for every ``split-*.jsonl`` file.

    For each file in *input_dir* whose name starts with ``split-`` and ends
    with ``.jsonl``, every line is parsed as JSON, its ``content`` string is
    tokenized, and the token counts are dumped to ``info-<name>.json`` in the
    same directory, where ``<name>`` is the file name without the leading
    ``split-`` and the trailing ``.jsonl``.

    Args:
        input_dir: Directory to scan and write into. Defaults to the
            module-level ``INPUT_DIR``.
        tokenize: Callable turning a string into an iterable of tokens.
            Defaults to ``jieba.cut``.
        show_progress: Whether to display a ``tqdm`` progress bar per file.

    Raises:
        json.JSONDecodeError: If a non-blank line of an input file is not
            valid JSON.
    """
    # Defaults are resolved at call time so the module-level names are only
    # required when the caller does not supply its own values.
    if input_dir is None:
        input_dir = INPUT_DIR
    if tokenize is None:
        tokenize = jieba.cut

    for fileName in os.listdir(input_dir):
        if not (fileName.startswith(_SPLIT_PREFIX) and fileName.endswith(_SPLIT_SUFFIX)):
            continue
        # Strip only the leading prefix and trailing suffix; a blanket
        # str.replace would also drop 'split-' from the middle of the name
        # and could make two inputs collide on one output file.
        name = fileName[len(_SPLIT_PREFIX): -len(_SPLIT_SUFFIX)]
        counts = _count_words(os.path.join(input_dir, fileName), tokenize, show_progress)
        with open(os.path.join(input_dir, f'info-{name}.json'), 'w', encoding = 'utf-8') as file:
            json.dump(counts, file, ensure_ascii = False)

# Run the word count only when this file is executed as a script, so that
# importing the module does not start processing files.
if __name__ == '__main__':
    main()