python处理word文件

最近处理 Word 文档的工作比较多。上次用了邮件合并功能,从 Excel 中读取数据填空,批量生成大量 Word 页面。后来又用到了通配符和非通配符两种方式查找数字:不勾选“使用通配符”时,^#^#^#^#^#^#^#^# 表示连续的 8 个数字;勾选“使用通配符”复选框后,[0-9]{1,3} 表示连续的 1-3 个数字。

而今天要处理一堆文件,把其中的内容挑出来整理到 Excel 中去。

python处理docx比较方便,而所给的文件除了一个之外都是doc,所以首先把所有doc文件另存为docx文件。代码如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# -*- coding: utf-8 -*-
import os
import win32api
import win32com
from win32com.client import Dispatch


def listdir(path, list_name):
    """Recursively collect the paths of all ``.doc`` files under *path*.

    Matching paths are appended to *list_name* in place; nothing is returned.
    The extension is compared case-insensitively, so ``REPORT.DOC`` is
    collected as well. ``.docx`` files are not matched.

    Args:
        path: Directory to scan; sub-directories are visited recursively.
        list_name: List that receives the joined path of every match.
    """
    for entry in os.listdir(path):
        file_path = os.path.join(path, entry)
        if os.path.isdir(file_path):
            listdir(file_path, list_name)
        elif os.path.splitext(file_path)[1].lower() == '.doc':
            list_name.append(file_path)

def doc2docx(path):
    """Convert one ``.doc`` file to ``.docx`` through Word, then delete the original.

    A Word instance is started via COM with its window and alerts turned
    off, the document is saved next to the original with a ``.docx``
    extension, and the source file is removed once Word has been shut down.
    The document and the Word instance are released even when opening or
    saving fails, so a failed conversion does not leave a Word process
    behind; in that case the original file is kept and the error propagates.

    Args:
        path: Path of the ``.doc`` file to convert.

    Returns:
        The path of the newly written ``.docx`` file.
    """
    word = win32com.client.Dispatch('Word.Application')
    try:
        word.Visible = 0
        word.DisplayAlerts = 0
        doc = word.Documents.Open(path)
        try:
            newpath = os.path.splitext(path)[0] + '.docx'
            # 12 is WdSaveFormat.wdFormatXMLDocument, i.e. the .docx format.
            doc.SaveAs(newpath, 12, False, "", True, "", False, False, False, False)
        finally:
            doc.Close()
    finally:
        word.Quit()
    # Only reached when the conversion succeeded.
    os.remove(path)
    return newpath



# Gather every .doc file below the working folder, then convert them one by one.
doc_files = []
listdir(r'C:\Users\25865\test', doc_files)

for doc_file in doc_files:
    doc2docx(doc_file)
    print(doc_file)

# Run the console "pause" command so the output stays visible.
os.system("pause")

调试过程效果如图所示

下一步,找到要找的元素的坐标位置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import docx
import csv
import os

def listdirdocx(path, list_name):
    """Recursively collect the paths of all ``.docx`` files under *path*.

    Matching paths are appended to *list_name* in place; nothing is returned.
    Sub-directories are scanned by this same function, so nested ``.docx``
    files are found too. The extension is compared case-insensitively.

    Args:
        path: Directory to scan.
        list_name: List that receives the joined path of every match.
    """
    for entry in os.listdir(path):
        file_path = os.path.join(path, entry)
        if os.path.isdir(file_path):
            # Recurse with this function so the .docx filter also applies
            # inside sub-directories.
            listdirdocx(file_path, list_name)
        elif os.path.splitext(file_path)[1].lower() == '.docx':
            list_name.append(file_path)

def pickinformation(path):
    """Print where a set of known sample values sit inside the tables of a .docx file.

    Every table of the document is walked row by row. ``i`` in the output is
    the 1-based row number; ``j`` is a running count of the cells visited so
    far in the current table (it is not reset at the start of each row).
    Whenever a cell's text equals one of the known values, its label and
    coordinates are printed.

    Args:
        path: Path of the .docx file to inspect.
    """
    # Known cell text -> label prefix printed in front of the coordinates.
    markers = {
        "艺术设计学院": "学院:i = ",
        "针对“阿尔兹海默症”的公益传播研究": "课题名称:i = ",
        "唐颖": "指导老师:i = ",
        "67395285": "办公电话: i = ",
        "13621182221": "手机号码: i = ",
        "tangying@bjut.edu.cn": "电子邮箱: i = ",
    }
    document = docx.Document(path)

    for table in document.tables:
        cell_no = 0
        for row_no, row in enumerate(table.rows, start=1):
            for cell in row.cells:
                cell_no += 1
                prefix = markers.get(cell.text)
                if prefix is not None:
                    print(prefix + str(row_no) + " j = " + str(cell_no))
                elif cell_no >= 10000:
                    # Safety stop for the current row on very large tables.
                    print("j>=10000")
                    break
            if row_no >= 10000:
                # Safety stop for the current table.
                print("i>=10000")
                break
# Gather every .docx file below the sample folder and report, file by file,
# where the known values are located.
docx_files = []
listdirdocx(r'C:\Users\25865\test1', docx_files)
for docx_file in docx_files:
    print(docx_file)
    pickinformation(docx_file)


# Run the console "pause" command so the output stays visible.
os.system("pause")

再下一步将内容提取出来,存为csv文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import docx
import csv
import os

def listdirdocx(path, list_name):
    """Recursively gather the paths of all ``.docx`` files below *path*.

    Each match is appended to *list_name* in place; the function returns
    nothing. Sub-directories are handled by calling this function again, so
    the ``.docx`` filter applies at every level. The extension check ignores
    letter case.

    Args:
        path: Directory to scan.
        list_name: List that receives the joined path of every match.
    """
    for entry in os.listdir(path):
        file_path = os.path.join(path, entry)
        if os.path.isdir(file_path):
            # Recurse into this function so nested folders are searched for
            # .docx files as well.
            listdirdocx(file_path, list_name)
        elif os.path.splitext(file_path)[1].lower() == '.docx':
            list_name.append(file_path)

def pickinformation(path):
    """Extract two fixed table cells from a .docx file and append them to the CSV.

    For each table in the document a row is started with the module-level
    counter ``num`` (as a string) and the fixed text '信息学部'. The table is
    then walked row by row while counting cells cumulatively (the count is
    not reset per row). The text of the cell at row 1 / cell 4 and of the
    cell at row 5 / cell 20 is printed and added to the row, which is finally
    appended to the CSV file.

    NOTE(review): the nesting of the original was lost; the CSV append is
    assumed to happen once per table - confirm against the original script.

    Args:
        path: Path of the .docx file to read.
    """
    # (row number, cumulative cell number) of the cells to extract.
    wanted = ((1, 4), (5, 20))
    document = docx.Document(path)

    for table in document.tables:
        record = [str(num), '信息学部']
        cell_no = 0
        for row_no, row in enumerate(table.rows, start=1):
            for cell in row.cells:
                cell_no += 1
                if (row_no, cell_no) in wanted:
                    print(cell.text)
                    record.append(cell.text)
        append_csv(r'C:\Users\25865\example.csv', [record])

def create_csv(path, encoding="utf-8-sig"):
    """Create (or truncate) the CSV file at *path* and write the header row.

    The file is written with an explicit encoding instead of the
    locale-dependent default, so the Chinese header can be written on any
    machine. The default ``utf-8-sig`` puts a byte-order mark at the start of
    the file.

    Args:
        path: Destination CSV file.
        encoding: Text encoding of the file; use the same value for
            ``append_csv``.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(path, "w", newline='', encoding=encoding) as file:
        csv_file = csv.writer(file)
        csv_file.writerow(["序号", "学院", "课题名称", "指导老师"])


def append_csv(path, datas, encoding="utf-8-sig"):
    """Append rows to the CSV file at *path*.

    Args:
        path: CSV file to extend; it is created if it does not exist.
        datas: Iterable of rows, each row being a sequence of cell values.
        encoding: Text encoding of the file; must match the one the file
            was created with.
    """
    # newline='' leaves line-ending handling to the csv module, which avoids
    # platform-specific newline translation (\n vs \r\n vs \r).
    with open(path, "a", newline='', encoding=encoding) as file:
        csv_file = csv.writer(file)
        csv_file.writerows(datas)


# Start a fresh CSV with the header row, then add the data extracted from
# every .docx file found below the working folder.
create_csv(r'C:\Users\25865\example.csv')
docx_files = []
listdirdocx(r'C:\Users\25865\test', docx_files)
# `num` is read by pickinformation() as the running serial number, so it
# must stay a module-level name.
num = 0
for num, docx_file in enumerate(docx_files, start=1):
    print(docx_file)
    pickinformation(docx_file)


# Run the console "pause" command so the output stays visible.
os.system("pause")

调试过程如下

最后将csv用excel打开,另存为xlsx

大功告成!

(沙雕老师们真的是刷新了我认知的下限,一个填了联系方式但是忘写姓名的居然是传闻对学长资源倾斜严重以至于让学长保研清华的年轻教授,好在认识还能填上人名(明明之前听学长的描述对这个老师印象超好的说,要不是研究方向我不是很擅长我就像同班同学一样也去找他了😂,嘛,虽然现在印象也不错,毕竟是四个老师中沙雕程度最低的一个);另一个所有个人信息忘了填,估计之后还得纳闷为啥没人找他做项目呢。还有两个项目名称爬不出来的,好在现在的windows文件系统搜索可以搜索到文件内的内容,直接搜一下就好了,结果一看,一个是填的超链接,一个是一人多开8个项目的其中一个项目贴了一张图进去,简直是人类究极迷惑行为,质疑一下他们真的认真想过给本科生做啥项目了么,那一个人带那么多组本科生的是不是根本就没想过管项目的死活?)