将使用win32com包进行处理
读取doc文件
# coding=utf-8
import fnmatch
import os

from win32com import client as wc
from win32com.client import Dispatch


def word2txt(filePath, savePath=''):
    """Convert a Word document (.doc / .docx) to a text file through Word's COM API.

    Args:
        filePath: Path of the document to convert. It is made absolute before
            being handed to Word, because Word runs as a separate process
            and does not share this script's working directory.
        savePath: Directory that receives the generated ``.txt`` file. When
            left empty, the text file is written next to the source document.

    Returns:
        None. The result is the ``.txt`` file written by Word. If the file
        name does not match ``*.doc`` / ``*.docx`` a message is printed and
        nothing is converted (same behaviour as the PDF converter).
    """
    filePath = os.path.abspath(filePath)
    dirs, filename = os.path.split(filePath)
    print(dirs, '\n', filename)

    # Reject unsupported files up front; otherwise the output name would be
    # empty and SaveAs would be pointed at a directory.
    if not any(fnmatch.fnmatch(filename, pattern) for pattern in ("*.docx", "*.doc")):
        print('格式不正确,仅支持doc/docx格式')
        return
    # Drop the final extension (".doc" or ".docx") and replace it with ".txt".
    new_name = filename.rpartition('.')[0] + '.txt'

    if savePath == '':
        savePath = dirs
    word2txtPath = os.path.abspath(os.path.join(savePath, new_name))
    print(word2txtPath)

    wordappp = wc.Dispatch('Word.Application')
    try:
        mytxt = wordappp.Documents.Open(filePath)
        try:
            # 4 is WdSaveFormat.wdFormatDOSText: save the content as plain text.
            mytxt.SaveAs(word2txtPath, 4)
        finally:
            mytxt.Close()
    finally:
        # Always shut Word down, otherwise every call leaves a WINWORD.EXE
        # process running in the background.
        wordappp.Quit()


if __name__ == '__main__':
    filePath = os.path.abspath(r'./专业课.docx')
    word2txt(filePath)
读取pdf
# coding=utf-8
import fnmatch
import os

from win32com import client as wc
from win32com.client import Dispatch


def pdf2txt(filePath, savePath=''):
    """Convert a PDF file to a text file by opening it in Word through COM.

    Args:
        filePath: Path of the PDF to convert. It is made absolute before
            being handed to Word, because Word runs as a separate process
            and does not share this script's working directory.
        savePath: Directory that receives the generated ``.txt`` file. When
            left empty, the text file is written next to the source PDF.

    Returns:
        None. The result is the ``.txt`` file written by Word. If the file
        name does not match ``*.pdf`` / ``*.PDF`` a message is printed and
        nothing is converted.
    """
    filePath = os.path.abspath(filePath)
    dirs, filename = os.path.split(filePath)
    print(dirs, '\n', filename)

    if not (fnmatch.fnmatch(filename, '*.pdf') or fnmatch.fnmatch(filename, '*.PDF')):
        print('格式不正确,仅支持pdf格式')
        return
    # Strip the 4-character ".pdf" suffix and append ".txt".
    new_name = filename[:-4] + '.txt'

    if savePath == '':
        savePath = dirs
    pdf2txtPath = os.path.abspath(os.path.join(savePath, new_name))
    print(pdf2txtPath)

    wordappp = wc.Dispatch('Word.Application')
    try:
        mytxt = wordappp.Documents.Open(filePath)
        try:
            # 4 is WdSaveFormat.wdFormatDOSText: save the content as plain text.
            mytxt.SaveAs(pdf2txtPath, 4)
        finally:
            mytxt.Close()
    finally:
        # Always shut Word down, otherwise every call leaves a WINWORD.EXE
        # process running in the background.
        wordappp.Quit()


if __name__ == '__main__':
    filePath = os.path.abspath(r'./论文.pdf')
    pdf2txt(filePath)