本部分记录如何利用Python集成一个分词工具:集成(打包)后的工具运行时无需另外配置环境,使用起来也更方便。
该文章主要是记录,知识点不是特别多,欢迎访问个人博客:https://blog.jiumoz.top/archives/fen-ci-gong-ju-ji-cheng
成品展示
软件链接:https://cloud.189.cn/web/share?code=BN3yYvIJfUfq(访问码:vvw4)
大家要是想体验的话,就下载了试试吧,有点大,主要是pyqt5太大了,好几百兆…
用的是天翼云,百度云盘有众所周知的原因,阿里云盘不能分享压缩包…
- 软件包含分词、词性标注、自定义停用词表、文件导出等功能,但是也依旧不够智能,比如不能自己设置很多参数、文件保存的格式…
工具介绍
都是python工具包,pip安装就行。
- GUI界面主要构成是PyQt5
- 核心功能是分词,使用的是jieba
- 打包有很多方式,这里使用两种方式,一种是利用cx_Freeze;另一种是pyinstaller;
开始简单的试验
首先实现简单的手工输入语句并完成分词与输出
主要的代码编写
- 借Python实现简单GUI程序中相关的内容,我们直接修改相关内容确定最后的窗体页面:
- 关键代码:借jieba分词中的内容,我们导入 jieba 包后直接集成,主要函数代码如下:
def cut(self):
    """Segment the text typed in ``self.first`` and show it in ``self.equal``.

    The input is tokenised with ``jieba.lcut`` and the tokens are written to
    the output box separated by single spaces. The raw text and the token
    list are also printed.
    """
    self.equal.clear()
    text = self.first.toPlainText()
    print(text)
    words = jieba.lcut(text)
    print(words)
    # str.join avoids both the repeated string concatenation and the stray
    # leading space that "word = word + ' ' + i" produces.
    self.equal.append(" ".join(words))
- 测试效果:
- 完整代码:
# -*- coding: utf-8 -*-
# @Time : 2022/5/1 11:52
# @Author : MinChess
# @File : test2.py
# @Software: PyCharm
import sys
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtWidgets import *
from PyQt5.QtGui import *
from PyQt5.QtCore import *
import jieba
class test(QWidget):
    """Minimal demo window: type a sentence, click the button, see the tokens.

    The window holds one button on top and two text boxes side by side
    below it: ``self.first`` (editable input) and ``self.equal`` (read-only
    output).
    """

    def __init__(self):
        super().__init__()
        self.initUI()

    def initUI(self):
        """Create the widgets, lay them out and show the window."""
        self.setWindowTitle("test")
        self.resize(1200, 900)
        self.setMinimumSize(1200, 900)

        # Top row: the single action button.
        self.addbtn = QPushButton("开始处理")
        self.addbtn.setFixedHeight(66)
        self.addbtn.setStyleSheet("font-size:36px;")
        self.addbtn.clicked.connect(self.cut)

        # Bottom row: input on the left, read-only result on the right.
        self.first = QTextEdit()
        self.equal = QTextEdit()
        self.equal.setReadOnly(True)

        vlayout = QHBoxLayout()
        vlayout.addWidget(self.addbtn)

        vlayout2 = QHBoxLayout()
        vlayout2.addWidget(self.first)
        vlayout2.addWidget(self.equal)

        alllayout = QVBoxLayout()
        alllayout.addLayout(vlayout)
        alllayout.addLayout(vlayout2)
        self.setLayout(alllayout)

        # Show only once the window has its size and layout.
        self.show()

    def cut(self):
        """Segment the text in ``self.first`` and show it in ``self.equal``.

        The input is tokenised with ``jieba.lcut`` and the tokens are
        written to the output box separated by single spaces. The raw text
        and the token list are also printed.
        """
        self.equal.clear()
        text = self.first.toPlainText()
        print(text)
        words = jieba.lcut(text)
        print(words)
        # str.join avoids both the repeated string concatenation and the
        # stray leading space that "word = word + ' ' + i" produces.
        self.equal.append(" ".join(words))
if __name__ == '__main__':
    # Script entry point: build the Qt application, show the demo window
    # and hand the event loop's return code to the interpreter.
    application = QApplication(sys.argv)
    window = test()
    window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)
集成为exe
这里主要介绍cx_Freeze集成的方法
- 通过 pip install cx_freeze 安装打包所用的库
- 编写配置文件
# -*- coding: utf-8 -*-
# @Time : 2021/12/1 20:44
# @Author : MinChess
# @File : setup.py
# @Software: PyCharm
import sys
from cx_Freeze import setup, Executable
# Options forwarded to cx_Freeze's "build_exe" command.
build_exe_options = dict(packages=["os"])

# Use the "Win32GUI" base only on Windows; other platforms keep None.
base = "Win32GUI" if sys.platform == "win32" else None

setup(
    name="九陌斋分词",
    version="0.1",
    description="My GUI application!",
    options=dict(build_exe=build_exe_options),
    # "xxx.py" is a placeholder for the script to be frozen.
    executables=[Executable("xxx.py", base=base)],
)
- 到该目录下执行 python setup.py build,运行该程序即可完成集成
完整分词工具的编写
- 首先得确定方案与路线
- 首先得有GUI界面:得有按钮 得有显示字符串的框…
- 核心代码:有了界面就得思考如何实现了,比如如何获取文件 如何输出 如何执行分词…
- 事件通信:事件通信就是思考如何将事件绑定起来,也就是点击对应按钮执行我们想要的功能
- 测试与集成:写完代码就需要不断的测试,直到没有bug再集成成为EXE文件
- 这里不做详细的代码拆分介绍,贴出完整代码如下,资料包点击链接即可获取
代码包(主程序、集成程序、图标文件、停用词表):https://cloud.189.cn/web/share?code=ZBZvqeBBz6Jb(访问码:0wri)
# -*- coding: utf-8 -*-
# @Time : 2021/12/1 20:44
# @Author : MinChess
# @File : jieba_cut.py
# @Software: PyCharm
import sys
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtWidgets import *
from PyQt5.QtGui import *
from PyQt5.QtCore import *
import re
import jieba
import jieba.posseg
from collections import Counter
class Jieba_Main_Window(QWidget):
    """Main window of the jieba word-segmentation tool.

    The window lets the user open a UTF-8 text file, edit a stop-word list,
    run jieba segmentation and part-of-speech tagging on the text, and save
    the source text, the segmentation result or the tagging result to disk.
    Everything written with ``print`` is redirected to the read-only
    "info" text box once the window has been constructed.
    """

    sig = pyqtSignal()

    # Style sheet shared by every push button in the window.
    _BUTTON_STYLE = '''QPushButton{font-size:18px;color:white;font-weight:bold;}QPushButton{background:#2ca9e1;border-radius:5px;}QPushButton:hover{background:#84a2d4;}'''

    # File-type filter offered by both "save as" dialogs.
    _SAVE_FILTER = "文本文件(*.txt);;Python程序(*.py);;文本文件(*.xlsx);;csv文件(*.csv);;所有文件(*.*)"

    def __init__(self):
        super().__init__()
        self.initUI()
        # Route print() output into the info text box.
        sys.stdout = Stream(newText=self.onUpdateText)
        # The timer is kept as an attribute but intentionally not connected:
        # its timeout signal carries no file name, so it cannot drive
        # __openByIODevice(fileName).
        self.timer = QTimer(self)

    def onUpdateText(self, text):
        """Append ``text`` to the info box and keep the end visible."""
        cursor = self.info_content.textCursor()
        cursor.movePosition(QTextCursor.End)
        cursor.insertText(text)
        self.info_content.setTextCursor(cursor)
        self.info_content.ensureCursorVisible()

    def _make_button(self, text, slot, height=32):
        """Return a styled push button whose ``clicked`` signal calls ``slot``."""
        button = QPushButton(text)
        button.setStyleSheet(self._BUTTON_STYLE)
        button.setFixedHeight(height)
        button.clicked.connect(slot)
        return button

    def initUI(self):
        """Create all widgets, arrange them and show the window."""
        self.setWindowTitle('九陌斋-Jieba分词')
        self.setWindowIcon(QIcon('favicon.ico'))
        self.resize(1200, 900)  # width x height
        self.setMinimumSize(1200, 900)

        # --- action buttons (left column) ---
        self.choose_file = self._make_button("文件选择", self.on_actQFile_Open_triggered)
        self.choose_save = self._make_button("文件保存", self.QFile_Save)
        self.start_clean = self._make_button("开始处理", self.jieba_cut)
        self.cut_word_list = self._make_button("默认列表", self.default_cut_list)
        self.default_clean = self._make_button("帮助文档", self.help_info)

        # --- source file name and content ---
        self.file_name = QLabel('文件名称:')
        self.file_name_path = QLineEdit()
        self.file_name_path.setReadOnly(True)
        self.lab_rules = QLabel('文件内容:')
        self.rules_content = QTextEdit()

        # --- stop-word list ---
        self.stop_words_rules = QLabel('停用词列表:')
        self.stop_words_content = QTextEdit()

        # --- segmentation result ---
        self.file_content_name = QLabel('分词结果:')
        self.file_content = QTextEdit()
        self.save_file_fenci = self._make_button(
            "分词结果保存", self.on_actQFile_Save_triggered, height=36)

        # --- redirected print() output ---
        self.lab_info = QLabel('信息输出(输出系统提示信息):')
        self.info_content = QTextEdit()
        self.info_content.setReadOnly(True)
        self.info_content.setStyleSheet("font-size:18px;color:#003399")

        # --- part-of-speech tagging result ---
        self.lab_finish = QLabel('词性标注结果(不建议使用文件保存,各方面问题还在解决中):')
        self.finishi_content = QTextEdit()
        self.save_file = self._make_button(
            "词性标注结果保存", self.on_actQFile_Save_triggered2, height=36)

        # Left column, upper part: buttons + source text | stop words.
        v00layout = QVBoxLayout()
        for widget in (self.choose_file, self.choose_save, self.start_clean,
                       self.cut_word_list, self.default_clean, self.file_name,
                       self.file_name_path, self.lab_rules, self.rules_content):
            v00layout.addWidget(widget)

        vlistlayout = QVBoxLayout()
        vlistlayout.addWidget(self.stop_words_rules)
        vlistlayout.addWidget(self.stop_words_content)

        vlistsum = QHBoxLayout()
        vlistsum.addLayout(v00layout, 5)
        vlistsum.addLayout(vlistlayout, 3)

        # Left column, lower part: info output.
        v10layout = QVBoxLayout()
        v10layout.addWidget(self.lab_info)
        v10layout.addWidget(self.info_content)

        v0layout = QVBoxLayout()
        v0layout.addLayout(vlistsum, 7)
        v0layout.addLayout(v10layout, 3)

        # Right column: segmentation result above tagging result.
        v01layout = QVBoxLayout()
        v01layout.addWidget(self.file_content_name)
        v01layout.addWidget(self.file_content)
        v01layout.addWidget(self.save_file_fenci)

        v11layout = QVBoxLayout()
        v11layout.addWidget(self.lab_finish)
        v11layout.addWidget(self.finishi_content)
        v11layout.addWidget(self.save_file)

        v1layout = QVBoxLayout()
        v1layout.addLayout(v01layout)
        v1layout.addLayout(v11layout)

        alllayout = QHBoxLayout()
        alllayout.addLayout(v0layout, 4)
        alllayout.addLayout(v1layout, 3)
        self.setLayout(alllayout)

        # Show only once the window has its size and layout.
        self.show()

    def __openByIODevice(self, fileName):
        """Load ``fileName`` into the source text box.

        Each line is decoded as UTF-8 (a leading byte-order mark is dropped)
        and stripped of surrounding whitespace.

        Returns:
            True when the file was read; False when it does not exist,
            cannot be opened, or is not valid UTF-8. On failure the text
            box is left untouched.
        """
        fileDevice = QFile(fileName)
        if not fileDevice.exists():
            return False
        if not fileDevice.open(QIODevice.ReadOnly | QIODevice.Text):
            return False
        lines = []
        try:
            while not fileDevice.atEnd():
                raw = bytes(fileDevice.readLine().data())  # QByteArray -> bytes
                lines.append(raw.decode("utf-8-sig").strip())
        except UnicodeDecodeError:
            return False
        finally:
            fileDevice.close()
        # Replace the editor content only after the whole file decoded, so a
        # bad file cannot leave a half-filled text box behind.
        self.rules_content.clear()
        for text in lines:
            self.rules_content.append(text)
        return True

    def on_actQFile_Open_triggered(self):
        """Ask for a file, load it and remember its path."""
        curPath = QDir.currentPath()
        title = "打开一个文件"
        filt = "文本文件(*.txt);;csv文件(*.csv);;程序文件(*.h *.py);;所有文件(*.*)"
        fileName, flt = QFileDialog.getOpenFileName(self, title, curPath, filt)
        if fileName == "":
            return  # dialog cancelled
        if self.__openByIODevice(fileName):
            self.file_name_path.setText(fileName)
            print("文件已打开!")
        else:
            print("错误", "打开文件失败")

    @staticmethod
    def _write_text(fileName, text):
        """Write ``text`` to ``fileName`` as UTF-8; return True on success."""
        fileDevice = QFile(fileName)
        if not fileDevice.open(QIODevice.WriteOnly | QIODevice.Text):
            return False
        try:
            # QIODevice.write returns -1 when the write failed.
            return fileDevice.write(text.encode("utf-8")) != -1
        finally:
            fileDevice.close()

    def __saveByIODevice(self, fileName):
        """Save the source text box to ``fileName``; return True on success."""
        return self._write_text(fileName, self.rules_content.toPlainText())

    def saveByIODevice(self, fileName):
        """Save the segmentation result to ``fileName``; return True on success."""
        return self._write_text(fileName, self.file_content.toPlainText())

    def saveByIODevice2(self, fileName):
        """Save the tagging result to ``fileName``; return True on success."""
        return self._write_text(fileName, self.finishi_content.toPlainText())

    def _save_as(self, saver):
        """Ask for a target path and call ``saver(path)``, reporting the outcome."""
        curPath = QDir.currentPath()
        title = "另存为一个文件"
        fileName, flt = QFileDialog.getSaveFileName(
            self, title, curPath, self._SAVE_FILTER)
        if fileName == "":
            return  # dialog cancelled
        if saver(fileName):
            print("文件保存成功:", fileName)
        else:
            print("错误", "保存文件失败")

    def on_actQFile_Save_triggered(self):
        """Save the segmentation result under a user-chosen name."""
        self._save_as(self.saveByIODevice)

    def on_actQFile_Save_triggered2(self):
        """Save the tagging result under a user-chosen name."""
        self._save_as(self.saveByIODevice2)

    def QFile_Save(self):
        """Write the (possibly edited) source text back to the opened file."""
        fileName = self.file_name_path.text()
        if fileName == "":
            print("你没有打开任何文件......")
            return  # nothing to save to
        if self.__saveByIODevice(fileName):
            print("保存成功!\n文件路径:", fileName)
        else:
            print("保存文件失败")

    def help_info(self):
        """Show the help dialog."""
        QMessageBox.about(self, "提示信息:", "分词模块主要利用jieba分词工具对文本数据进行分词处理,同时还加入了去停用词功能,即去掉一些无意义的词条,支持自定义停用词词表!这一个大模块的功能非常重要,因为词频统计、LDA主题模型等多个方面都是基于词语展开的!同时分词功能中还加入了多个模式的分词。")

    def jieba_cut(self):
        """Segment and tag the source text, skipping stop words.

        Tokens that are in the stop-word list, or that are a single tab or
        space, are dropped. The remaining tokens, joined by spaces, go to
        the segmentation box. The same text with the spaces removed is
        segmented again for the info output and tagged with
        ``jieba.posseg``; each ``word,flag`` pair becomes one line of the
        tagging box.
        """
        self.finishi_content.clear()
        self.file_content.clear()

        line = self.rules_content.toPlainText()
        stop_text = self.stop_words_content.toPlainText()
        if line == "":
            print("待处理文件为空!!!")
            return
        if stop_text == "":
            print("请设置停用词列表,可点击默认列表使用系统列表!")
            return

        stop_words = set(stop_text.split())
        kept = [
            word for word in jieba.cut(line)
            if word not in stop_words and word != '\t' and word != " "
        ]
        cut_re = " ".join(kept)
        compact = cut_re.replace(" ", "")
        all_words = " ".join(jieba.cut(compact))

        # Disable the start button while events are pumped below, so a
        # second click cannot re-enter this method mid-run.
        self.start_clean.setEnabled(False)
        try:
            for word, flag in jieba.posseg.cut(compact):
                self.finishi_content.append(word + ',' + flag)
                # Let the GUI repaint between words so it stays responsive.
                QApplication.processEvents()
        finally:
            self.start_clean.setEnabled(True)

        print(all_words)
        self.file_content.append(cut_re)

    def default_cut_list(self):
        """Fill the stop-word box from ``NLPIR_stopwords.txt``.

        The file is looked up relative to the current working directory.
        If it cannot be read, a message is printed and the box is left
        unchanged.
        """
        filename = "NLPIR_stopwords.txt"
        try:
            with open(filename, "r", encoding='UTF-8') as pf:
                content = pf.read()
        except (OSError, UnicodeDecodeError) as err:
            print("默认停用词表读取失败:", err)
            return
        # setPlainText: the list is plain text and must never be interpreted
        # as rich text, whatever symbols it contains.
        self.stop_words_content.setPlainText(content)
        self.selected = False
class Stream(QObject):
    """Write-only, file-like object that forwards text through a Qt signal.

    An instance can stand in for ``sys.stdout``: every ``write`` call emits
    ``newText`` with the written text.
    """

    newText = pyqtSignal(str)

    def write(self, text):
        """Emit ``text`` (converted to ``str``) through ``newText``."""
        self.newText.emit(str(text))

    def flush(self):
        """Do nothing; text is emitted immediately by ``write``.

        Present so that code which flushes the stream (for example
        ``print(..., flush=True)``) does not fail with AttributeError.
        """
if __name__ == '__main__':
    # Script entry point: build the Qt application and the main window,
    # then hand the event loop's return code to the interpreter.
    application = QApplication(sys.argv)
    # Keep a reference so the window lives as long as the event loop runs.
    window = Jieba_Main_Window()
    exit_code = application.exec_()
    sys.exit(exit_code)