// Repository: search-engine (branch: master) — file: DictProducer.cpp

#include "DictProducer.hpp"
#include "Mylogger.hpp"
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
#include <cstring>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include <algorithm>
#include <cctype>
#include <climits>
#include <unordered_set>

using std::cout;
using std::endl;
using std::cin;
using std::cerr;
using std::string;
using std::ofstream;
using std::ifstream;        //输入文件对象，是个对象
using std::istringstream;   //输入字符串对象
using std::ios;
using std::unordered_set;
using wd::Mylogger;

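/*
 *  DictProducer workflow:
 *  1. Read the corpus directory and collect the absolute path of every corpus file.
 *  2. Tokenize the text with the SplitTool helper class.
 *  3. Insert the tokens into the word-frequency dictionary.
 *  4. Build the index.
 *  5. Write both the word-frequency dictionary and the index to files.
 */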


/**
 * Resolve `path` to a canonical absolute path using realpath(3).
 *
 * @param path  Path to resolve (relative or absolute).
 * @return The resolved path, or an empty string when realpath() fails
 *         (for example when the path does not exist).
 */
string getAbsolutePath(const std::string& path)
{
    char buffer[PATH_MAX];
    // realpath() returns a pointer to `buffer` on success and nullptr on failure.
    const char* const resolved = realpath(path.c_str(), buffer);
    return resolved ? std::string(resolved) : std::string();
}
/**
 * Tell whether a directory entry must be skipped while scanning the corpus.
 *
 * Skipped names are ".", "..", and the two stop-word files
 * "stop_words_eng.txt" and "stop_words_zh.txt".
 *
 * @param direntp  Directory entry to test; only its d_name is read.
 * @return true when the entry name is one of the skipped names.
 */
static inline bool jumpfile(struct dirent* direntp)
{
    static const char* const kSkippedNames[] = {
        ".", "..", "stop_words_eng.txt", "stop_words_zh.txt"
    };
    for (const char* skipped : kSkippedNames)
    {
        if (strcmp(direntp->d_name, skipped) == 0)
        {
            return true;
        }
    }
    return false;
}

// Out-of-class definitions of DictProducer's static data members.
// NOTE(review): these presumably back a call_once-style singleton accessor
// declared in DictProducer.hpp; that accessor is not visible in this file — confirm.
std::once_flag DictProducer::_dictflag;     // default-initialized once_flag
DictProducer* DictProducer::_dictInstance = nullptr;   // no instance exists yet
// Constructor.
// Value-initializes the file lists, dictionaries and indexes, binds the
// tokenizer, logger and configuration obtained from their getInstance()
// accessors, then scans the corpus directory configured under "wordpath".
// The stop-word files are skipped during that scan (see jumpfile()).
// NOTE(review): _dictMap is initialized from _dictConf, so _dictConf must be
// declared before _dictMap in the class (members are initialized in
// declaration order, not list order) — confirm against DictProducer.hpp.
DictProducer::DictProducer()
    :_file()
    ,_file_en()
    ,_dict()
    ,_dict_en()
    ,_index()
    ,_index_en()
     ,_spt(SplitToolCppJieBa::getInstance())      // tokenizer; the original author's note says "copy constructor" — whether this copies or binds a reference depends on the member type (not visible here)
     ,_myLog(*Mylogger::getInstance())
     ,_dictConf(Configuration::getInstance())       // configuration information
    ,_dictMap(_dictConf.getPathMap())   // path map taken from the configuration
{
    getFiles(_dictMap["wordpath"]);          // collect the paths of all corpus files
    LogInfo("DictProducer构造函数");
}
// Destructor: only emits a log line. The original author's note states that
// all members are containers or built-in types, so no custom cleanup is needed.
DictProducer::~DictProducer()                       // no explicit resource release here
{
    LogInfo("~DictProducer()析构函数");
}
/**
 * Build the English word-frequency dictionary (_dict_en) and the
 * letter index (_index_en) from every file listed in _file_en.
 *
 * Each line has all non-alphabetic characters replaced by spaces and is then
 * split on whitespace; every resulting word has its counter incremented.
 *
 * The index maps each upper-cased letter of a word to the word's 1-based
 * position in _dict_en's iteration order, matching what buildCnDict() stores
 * in _index. The previous code inserted the word's frequency instead of its
 * position, so the index could not be used to locate words.
 *
 * Logs an error and terminates the process with exit(1) when a corpus file
 * cannot be opened.
 */
void DictProducer::buildEnDdict()
{
    for(const auto & file : _file_en)
    {
        // A fresh stream per file avoids carrying eof/fail state across files.
        ifstream enDictFile(file.c_str(), ios::in);
        if(!enDictFile.is_open())
        {
            LogError("enDictFile couldn't open");
            exit(1);
        }

        string line;
        while(getline(enDictFile, line))
        {
            // Turn every non-letter into a separator. The cast to unsigned char
            // is required: passing a negative char (any non-ASCII byte) to
            // isalpha() is undefined behaviour.
            std::replace_if(line.begin(), line.end(),
                            [](char c){ return !isalpha(static_cast<unsigned char>(c)); },
                            ' ');

            // Split on whitespace and count each word.
            istringstream ist(line);
            string word;
            while(ist >> word)
            {
                ++_dict_en[word];
            }
        }
    }

    // Build the English index: letter -> set of 1-based word positions.
    int position = 1;
    for(auto iter = _dict_en.begin(); iter != _dict_en.end(); ++iter, ++position)
    {
        for(auto ch = iter->first.begin(); ch != iter->first.end(); ++ch)
        {
            _index_en[std::toupper(static_cast<unsigned char>(*ch))].insert(position);
        }
    }
}
void DictProducer::buildCnDict()                      //创建中文字典
{
    //保存停止词到unordered_set容器中，
    //要注意windows下文本文件和linux下文本文件的区别。windows以\r\n结尾，而linux以\n做结尾
    //同时中文的编码问题也要注意，是ascii码还是utf-8的
    ifstream stopWordFile(_dictMap["cn_stopwordpath"]);
    if(!stopWordFile)
    {
        LogError("stopWordFile couldn't open");
        exit(1);
    }
    unordered_set<string> stopWordUnset;
    string text;
    while(getline(stopWordFile,text))  //获取所有的中文停止词
    {
        stopWordUnset.insert(text);

    }

    //遍历容器中的文件并分词
    ifstream cnDictFile;    //空的输入文件对象
    vector<string> tmp; //保存分词结果的临时内存
    string line;        //文件中的行
    for(auto it = _file.begin();it != _file.end();++it)
    {
        cnDictFile.open(*it,ios::in);
        if(!cnDictFile.is_open())  //如果用了open成员函数就不要用!cnDictFile来判断。
        {
            LogError("cnDictFile couldn't open");
            exit(1);
        }
        while(getline(cnDictFile,line))
        {
            //分词
            tmp = _spt.cut(line);
            for(auto & e :tmp)      //将分词结果插入map容器中，map关键字唯一，红黑树实现
            {
                //去除无用的字
                if((stopWordUnset.find(e) != stopWordUnset.end()))
                {
                    continue;
                }
                else
                    _dict[e]++;
            }
        }
        cnDictFile.close();
    }
    //构建中文索引库
    int j = 1;
    for(auto it = _dict.begin();it != _dict.end();++it)  //迭代器指向一个pair对象
    {
        for(string::size_type i = 0; i < it->first.size();)
        {
            string strWord(it->first.substr(i,3));//两个参数的substr的含义是从第一个参数开始，的第二个参数值的字符
            _index[strWord].insert(j);
            i+=3;
        }
        j++;
    }
}
void DictProducer::storeDict()     //将字典写入文件
{
    //生成词频文件路径和索引库文件路径
    string filepath = _dictMap["dictstorepath"];
    string cnWordFre(filepath),cnIndex(filepath),enWordFre(filepath),enIndex(filepath);
    cnWordFre += "/";
    cnWordFre += "cndict.dat";
    cnIndex += "/";
    cnIndex += "cndictIndex.dat";
    enWordFre += "/";
    enWordFre +="endict.dat";
    enIndex += "/";
    enIndex += "endictIndex.dat";
    //建立输出文件对象，关联文件
    ofstream cnWordFreFile(cnWordFre,ios::out),cnIndexFile(cnIndex,ios::out),
             enWordFreFile(enWordFre,ios::out),enIndexFile(enIndex,ios::out);
    if(!cnWordFreFile || !cnIndexFile || !enWordFreFile || !enIndexFile)
    {
        LogError("DictProducer:storeDict couldn\'t open the file");
        return;
    }
    //存储中文词频文件
    for(auto it = _dict.begin();it != _dict.end();++it)
    {
        cnWordFreFile << it->first << " " << it->second << endl;
    }
    LogInfo("DictProducer:storeDict cnWordFreFile stored");
    //索引中文库文件
    for(auto it = _index.begin();it != _index.end();++it)
    {
        cnIndexFile << it->first << " ";
        //set容器不支持修改,也就只有常量迭代器
        for(auto it2 = it->second.cbegin();it2 !=it->second.end();++it2 )
        {
            cnIndexFile << *it2 << " ";
        }
        cnIndexFile << endl;
    }
    LogInfo("DictProducer:storeDict cnIndexFile stored");
    //英文词频库存储
    for(auto it = _dict_en.begin();it != _dict_en.end();++it)
    {
        enWordFreFile << it->first << " " << it->second << endl;
    }
    LogInfo("DictProducer:storeDict enWordFreFile stored");
    //英文索引库存储
    for(auto it = _index_en.begin();it != _index_en.end();++it)
    {
        enIndexFile << it->first << " ";
        //set容器不支持修改，也就只有常量迭代器
        for(auto it2 = it->second.cbegin();it2 != it->second.cend();++it2)
        {
            enIndexFile << *it2 << " ";
        }
        enIndexFile << endl;
    }
    LogInfo("DictProducer:storeDict enIndexFile stored");
    cnWordFreFile.close();
    cnIndexFile.close();
    enWordFreFile.close();
    enIndexFile.close();
}
// Print every collected path in _file to standard output, one per line.
// Intended for testing.
void DictProducer::showFiles() const
{
    for(const auto & filePath : _file)
    {
        cout << filePath << endl;
    }
}
// Dump both dictionaries and both indexes to standard output.
// Intended for testing.
void DictProducer::showDict()const
{
    // Chinese word frequencies
    cout << "中文词典"  <<endl;
    for(const auto & wordFreq : _dict)
    {
        cout << wordFreq.first << ": " << wordFreq.second << endl;
    }

    // Chinese index: key followed by the ids stored for it
    cout << endl <<"中文Index" << endl;
    for(const auto & indexEntry : _index)
    {
        cout << indexEntry.first << ": ";
        for(const auto & id : indexEntry.second)
        {
            cout << id << " ";
        }
        cout << endl;
    }

    // English word frequencies
    cout << "英文词典"  <<endl;
    for(const auto & wordFreq : _dict_en)
    {
        cout << wordFreq.first << ": " << wordFreq.second << endl;
    }

    // English index: key followed by the ids stored for it
    cout << endl <<"英文Index" << endl;
    for(const auto & indexEntry : _index_en)
    {
        cout << indexEntry.first << ": ";
        for(const auto & id : indexEntry.second)
        {
            cout << id << " ";
        }
        cout << endl;
    }
}
void DictProducer::getFiles(const string & dir) //输出文件的绝对路径
{
    //打开指定目录
    DIR* dp = opendir(dir.c_str());     //获取目录指针
    if(dp==nullptr)
    {
        LogError("DictProducer::getFiles opendir wrong");
        exit(1);
    }
    struct dirent* direntp;             //目录项指针
    string path = dir;                        //获取语料库中每个文件的绝对路径
    //遍历目录,获取文件的绝对路径
    while((direntp = readdir(dp))!= nullptr)    //readdir会自动移动目录项指针
    {
        //跳过一些目录项,比如. 和 .. 以及一些指定的文件
        if(jumpfile(direntp))
        {
            continue;
        }
        else if(strcmp(direntp->d_name,"The_Holy_Bible.txt")==0||
                strcmp(direntp->d_name,"english.txt") == 0)
        {
            //拼接绝对路径
            path += "/";
            path += direntp->d_name;
            _file_en.push_back(path);
        }
        else
        {
            //拼接绝对路径
            path += "/";
            path += direntp->d_name;

            if(direntp->d_type == DT_DIR)
            {
                getFiles(path);         //若是目录则递归
            }
            else if(direntp->d_type == DT_REG)//若是普通文件则插入容器中
            {
                _file.push_back(string(getAbsolutePath(path)));
            }

        }
        //更新path
        path = dir;
    }
    closedir(dp);               //关闭目录指针
}