master

分支 (1)

管理

管理

master

lda_pagerank_doc2vec_model
/
text2stemedwords.py

# encoding=utf-8
"""
Created on 2019年3月1日

@author: yuqi
"""
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# Name of the corpus to process ('低关注' translates to "low attention").
# It selects the input file data/<type>.txt and the output file
# data/<type>_title.txt.
# NOTE(review): this module-level name shadows the builtin `type`;
# consider renaming it once no other code depends on the name.
type = '低关注'

# Punctuation tokens removed from the token stream before stemming.
ENGLISH_PUNCTUATIONS = frozenset([
    ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '-',
])


def extract_title(line):
    """Return the lower-cased title stored in the second tab-separated field.

    Only the line terminator is removed before splitting, so a record whose
    first field is empty still has its title in column two (stripping all
    surrounding whitespace would eat the leading tab and shift the columns).

    :param line: one raw line of the input file, with or without newline.
    :return: the title, lower-cased and stripped of surrounding whitespace,
        or ``''`` when the line has no second field (e.g. a blank line).
    """
    fields = line.rstrip('\r\n').split('\t')
    if len(fields) < 2:
        return ''
    return fields[1].strip().lower()


def stem_tokens(tokens, stop_words, stem):
    """Drop stop words and punctuation from *tokens* and stem the remainder.

    :param tokens: iterable of token strings.
    :param stop_words: container of tokens to discard (a set is fastest).
    :param stem: callable mapping a single token to its stem.
    :return: list of stems in the original token order.
    """
    return [
        stem(token)
        for token in tokens
        if token not in stop_words and token not in ENGLISH_PUNCTUATIONS
    ]


def main(category):
    """Stem the titles of ``data/<category>.txt`` into ``data/<category>_title.txt``.

    Exactly one output line is written per input line, so both files stay
    line-aligned; a record without a title yields an empty output line.
    Any third field (the abstract) is ignored.

    :param category: corpus name used to build the input and output paths.
    """
    # Loop invariants: build the stop-word set and the stemmer once rather
    # than once per input line; a frozenset gives O(1) membership tests.
    stop_words = frozenset(stopwords.words('english'))
    stem = PorterStemmer().stem

    # Open the input first so that a missing input file does not truncate
    # an already existing output file.
    with open('data/%s.txt' % category, 'r', encoding='utf-8') as f:
        with open('data/%s_title.txt' % category, 'w', encoding='utf-8') as op:
            for line in f:
                title_tokens = nltk.word_tokenize(extract_title(line))
                stems = stem_tokens(title_tokens, stop_words, stem)
                op.write('%s\n' % ' '.join(stems))


if __name__ == '__main__':
    main(type)