#!/usr/bin/env python
import os,re,openpyxl
'''
输入:帆软脚本文件路径
输出:帆软文件检查结果Excel
'''
# Extract source tables (tables referenced by FROM/JOIN) from a SQL string
def table_scan(sql_str):
    """Extract the source table names referenced by a SQL query string.

    Strips comments, tokenizes the SQL, and collects the token following
    each FROM/JOIN keyword. As a fallback, handles FineReport conditional
    table selection such as ``from ${if(cond,"tableA","tableB")}``.

    :param sql_str: raw SQL text (may contain FineReport template syntax)
    :return: list of table names (backquotes removed); may contain duplicates
    """
    # Remove /* ... */ block comments.
    q = re.sub(r"/\*[^*]*\*+(?:[^*/][^*]*\*+)*/", "", sql_str)
    # Drop whole-line -- and # comments (raw strings: avoid invalid-escape warnings).
    lines = [line for line in q.splitlines() if not re.match(r"^\s*(--|#)", line)]
    # Strip trailing -- and # comments.
    q = " ".join([re.split(r"--|#", line)[0] for line in lines])
    # Split on whitespace, parentheses and semicolons.
    tokens = re.split(r"[\s)(;]+", q)
    # Scan the tokens: after a FROM or JOIN keyword, grab the next token
    # (unless it is SELECT, i.e. a subquery).
    result = []
    get_next = False
    for token in tokens:
        if get_next:
            if token.lower() not in ["", "select"]:
                # Filter out non-table garbage produced by conditional
                # table-selection expressions.
                if ('"+if' not in token and '"' not in token
                        and '$if' not in token and '${if' not in token):
                    result.append(token.replace('`', ''))
            get_next = False
        get_next = token.lower() in ["from", "join"]
    # Special case: conditional source table after FROM, e.g.
    # from ${if(cond,"tableA","tableB")}
    if not result:
        tab_pat = re.compile(r'from.*?if\((.*?)\)', re.S)
        for i in re.findall(tab_pat, sql_str):
            temp_r = i.replace('"', '').split(',')
            # BUGFIX: require at least 3 parts — a malformed or partial
            # if(...) body previously raised IndexError on temp_r[2].
            if len(temp_r) >= 3 and '' not in temp_r and '/*' not in temp_r:
                result.append(temp_r[1].replace('`', '').strip())
                result.append(temp_r[2].replace('`', '').strip())
    return result
# File scan: first version using regex parsing — accuracy is limited!
def file_scan(path):
    """First-pass regex scan of a FineReport file (limited accuracy).

    :param path: FineReport file path
    :return: (rep_list, rs_sql_list) where rep_list is a de-duplicated list
             of referenced report paths / URLs, and rs_sql_list is a list of
             [dataset name, DB connection name, query SQL, set of source tables].
    """
    # Close the file handle deterministically (was left open before).
    with open(path, 'r', encoding='utf-8') as f:
        f_content = f.read()
    # 1. Dataset queries.
    sqlgpat = re.compile('<TableDataMap>(.*?)</TableDataMap>', re.S)
    if_has_sqlg = re.findall(sqlgpat, f_content)
    rs_sql_list = []
    if if_has_sqlg:
        # Capture dataset name, DB connection name and query SQL.
        sqlspat = re.compile(
            r'<TableData name="(.*?)".*?<DatabaseName>\n<!\[CDATA\[(.*?)]]></DatabaseName>'
            r'.*?<Query>\n<!\[CDATA\[(.*?)]]></Query>.*?</TableData>', re.S)
        rs1 = re.findall(sqlspat, if_has_sqlg[0])
        for rsv in rs1:
            from_tables = []
            # BUGFIX: the query SQL is rsv[2]; the original inspected and split
            # rsv[1] (the DatabaseName), so the '*/'-separated multi-statement
            # handling never triggered (xml_scan operates on the query text).
            if '"*/"' in rsv[2]:
                for ss in rsv[2].split('*/'):
                    from_tables.extend(table_scan(ss))
            else:
                from_tables.extend(table_scan(rsv[2]))
            rs_sql_list.append([rsv[0], rsv[1], rsv[2], set(from_tables)])
    # 2. JavaScript hyperlinks.
    if_has_jsgpat = re.compile('<NameJavaScript name="(.*?)</NameJavaScript>', re.S)
    if_has_jsg = re.findall(if_has_jsgpat, f_content)
    rep_list = []  # referenced reports, de-duplicated
    if if_has_jsg:
        # Hoisted out of the loop: the pattern is loop-invariant.
        conturlpat = re.compile('<Content>.*?viewlet=(.*?[cptfrm]{3})[&?].*?</Content>', re.S)
        for jscon in if_has_jsg:
            if '<JavaScript class="com.fr.js.ReportletHyperlink">' in jscon:
                rlpat = re.compile(r'<ReportletName .*?\[CDATA\[(.*?)]]></ReportletName>', re.S)
                rls = re.findall(rlpat, jscon)
                # Guard: a missing ReportletName previously raised IndexError.
                if rls and rls[0] not in rep_list:
                    rep_list.append(rls[0])
            elif '<JavaScript class="com.fr.js.WebHyperlink">' in jscon:
                wlpat = re.compile(r'<URL>\s<!\[CDATA\[(.*?)]]></URL>', re.S)
                wls = re.findall(wlpat, jscon)
                if wls and wls[0] not in rep_list:
                    rep_list.append(wls[0])
            elif re.search(conturlpat, jscon):
                frl = re.findall(conturlpat, jscon)[0]
                # Normalize sloppy local paths: a/b/c.cpt -> /a/b/c.cpt
                if not frl.startswith('/'):
                    frl = '/' + frl
                if frl not in rep_list:
                    rep_list.append(frl)
    # 3. Reports mounted in a tab frame.
    if_has_cljpat = re.compile(r'<RHIframeSource.*?<Attr path="(.*?[cptfrm]{3}).*?</RHIframeSource>', re.S)
    f_has_clj = re.findall(if_has_cljpat, f_content)
    for v in f_has_clj:
        if v not in rep_list:
            rep_list.append(v)
    return rep_list, rs_sql_list
# Precise extraction using XML parsing
def xml_scan(path):
    """Parse a FineReport file as XML and extract dependency information.

    :param path: FineReport file path (must be well-formed XML)
    :return: (dataset_iters, urls, js_contents) where
             dataset_iters is a list of [dataset name, DB connection name,
             query SQL, source tables]; urls is the set of all downstream
             report paths / URLs the file references; js_contents is a list
             of [raw JS content, URLs cleaned from it] kept for verifying
             that the URL cleaning is accurate.
    :raises xml.etree.ElementTree.ParseError: if the file is not valid XML.
    """
    import xml.etree.ElementTree as ET
    tree = ET.parse(path)  # open the xml file
    dataset_iters = []  # dataset name, DB connection name, query SQL, source tables
    if list(tree.getroot().iter("TableDataMap")):
        # TableDataMap is the parent node of all dataset queries.
        table_map_content = list(tree.getroot().iter("TableDataMap"))[0]
        # Each TableData element is one dataset query.
        dataset_iters_map = table_map_content.iter('TableData')
        for val in dataset_iters_map:
            dataset_name = val.attrib.get("name").strip()
            if len(list(val.iter("DatabaseName"))):
                # A FineReport dataset query uses a single database connection,
                # so there is exactly one DatabaseName element.
                dataset_connect_name = list(val.iter("DatabaseName"))[0].text.strip()
            else:
                dataset_connect_name = None
            if len(list(val.iter("Query"))):
                # Likewise a dataset has a single Query (SQL) element.
                dataset_query = list(val.iter("Query"))[0].text.strip()
                from_tables = []
                # '"*/"' marks multi-statement queries; scan each piece.
                if '"*/"' in dataset_query:
                    sql = dataset_query.split('*/')
                    for ss in sql:
                        from_tables.extend(table_scan(ss))
                else:
                    from_tables.extend(table_scan(dataset_query))
            else:
                dataset_query = None
                from_tables = []
            dataset_iters.append([dataset_name, dataset_connect_name, dataset_query, from_tables])
    urls = set()  # every downstream URL / report referenced by this file
    js_contents = []  # [js content, URLs cleaned from it], for manual verification
    # JS report hyperlinks - grid reports on the local server.
    local_url = [v.text.strip() for v in tree.iter("ReportletName")]
    if local_url:
        urls.update(local_url)
    # JS report hyperlinks - remote web links.
    web_url = [v.text.strip() for v in tree.iter("URL")]
    if web_url:
        urls.update(web_url)
    # JS report hyperlinks - reports mounted in a tab frame.
    for v in tree.iter("RHIframeSource"):
        webframe_url = list(v.iter("Attr"))[0].attrib.get("path")
        # Strip trailing URL parameters after the .cpt/.frm extension.
        if webframe_url and not webframe_url.endswith("frm") and not webframe_url.endswith("cpt"):
            # NOTE(review): [cptfrm]{3} matches ANY three of these characters
            # (e.g. "frc"), not only "cpt"/"frm" — confirm this is intended.
            rpat = re.compile(r'.*?[cptfrm]{3}', re.I)
            webframe_url = re.findall(rpat, webframe_url)[0]
            urls.update([webframe_url])
        elif webframe_url:
            urls.update([webframe_url])
    for cv in list(tree.iter("Content")):
        # NOTE(review): cv.text may be None for an empty element, which would
        # make re.findall raise TypeError — presumably callers guard with
        # try/except; confirm.
        contents = cv.text
        temp_url = []
        http_ul_pat = re.compile(r'"(http.*?)"')  # web hyperlinks embedded in JS content
        local_ul_pat = re.compile(r'viewlet=(.*?[cptfrm]{3})')  # server-local report paths in JS content
        if re.findall(http_ul_pat, contents):
            urls.update(re.findall(http_ul_pat, contents))
            temp_url.extend(re.findall(http_ul_pat, contents))
        if re.findall(local_ul_pat, contents):
            # Normalize sloppy local paths: a/b/c.cpt -> /a/b/c.cpt
            for vl in re.findall(local_ul_pat, contents):
                if vl.startswith('/'):
                    urls.update([vl])
                    temp_url.append(vl)
                else:
                    urls.update(['/' + vl])
                    temp_url.append('/' + vl)
        js_contents.append([contents, temp_url])
    return dataset_iters, urls, js_contents
def write_excel(list_tar, file_path):
    """Write scan results to an Excel workbook with four sheets.

    Sheets: referenced reports, source MySQL tables, dataset query detail,
    and JS content detail.

    :param list_tar: list of [file name, dataset_iters, urls, js_contents]
                     entries as produced by xml_scan
    :param file_path: absolute path of the output .xlsx file
    """
    wb = openpyxl.Workbook()  # new workbook
    sheet0 = wb[wb.sheetnames[0]]
    sheet0.title = '引用报表列表'
    sheet1 = wb.create_sheet('来源mysql表')
    sheet2 = wb.create_sheet('帆软数据集查询及依赖明细')
    sheet3 = wb.create_sheet('帆软JS内容明细')
    # Header rows.
    sheet0['A1'] = '文件名'
    sheet0['B1'] = '依赖报表'
    sheet1['A1'] = '文件名'
    sheet1['B1'] = '依赖mysql表'
    sheet2['A1'] = '文件名'
    sheet2['B1'] = '数据集查询名称'
    sheet2['C1'] = '数据库链接名称'
    sheet2['D1'] = '数据集查询语句'
    sheet2['E1'] = '数据来源mysql表'
    sheet3['A1'] = '文件名'
    sheet3['B1'] = 'JS内容'
    sheet3['C1'] = 'JS解析URL'
    # Per-sheet row cursors; row 1 holds the headers.
    r = 1  # sheet0
    k = 1  # sheet2
    d = 1  # sheet1
    x = 1  # sheet3
    for index, item in enumerate(list_tar):
        print(('开始处理第 ' + str(index + 1) + ' 个文件结果,共 ' + str(len(list_tar)) + ' 个').center(50, '-'))
        target_file_name = item[0]
        sql = item[1]  # dataset_iters: [name, connection, query, source tables]
        cpt = item[2]  # urls: referenced reports / links
        jsc = item[3]  # js_contents: [content, cleaned URLs]
        # Sheet0: referenced reports (idiom fix: enumerate indices were unused).
        for value in sorted(cpt):
            r += 1
            sheet0.cell(row=r, column=1, value=target_file_name)
            sheet0.cell(row=r, column=2, value=value)
        # Sheet2: dataset detail; also collect the union of source tables.
        sql_set = set()
        for val in sql:
            k += 1
            sql_set.update(val[3])
            sheet2.cell(row=k, column=1, value=target_file_name)
            sheet2.cell(row=k, column=2, value=val[0])
            sheet2.cell(row=k, column=3, value=val[1])
            sheet2.cell(row=k, column=4, value=val[2])
            sheet2.cell(row=k, column=5, value='\n'.join(val[3]))
        # Sheet1: de-duplicated source tables.
        for value in sorted(sql_set):
            d += 1
            sheet1.cell(row=d, column=1, value=target_file_name)
            sheet1.cell(row=d, column=2, value=value)
        # Sheet3: JS content and the URLs cleaned from it.
        for value in sorted(jsc):
            if value[0] or value[1]:
                x += 1
                sheet3.cell(row=x, column=1, value=target_file_name)
                sheet3.cell(row=x, column=2, value=value[0])
                sheet3.cell(row=x, column=3, value='\n'.join(value[1]))
    wb.save(file_path)
    wb.close()  # the workbook must be closed after use, or later saves may fail
def main_scan(fr_path, result_path):
    """Walk fr_path, scan every file, and write the results to Excel.

    Files that fail to parse (e.g. non-XML content) are reported and skipped.

    :param fr_path: root directory of the FineReport files to scan
    :param result_path: absolute path of the output .xlsx result file
    """
    rs_list = []
    file_list = []
    file_name = []
    for dirpath, dirnames, filenames in os.walk(fr_path):
        for file in filenames:
            full_path = os.path.join(dirpath, file)
            file_list.append(full_path)
            # Strip the local scan-root prefix and normalize separators so the
            # recorded name is a server-style relative path.
            file_name.append(full_path.replace(r'【清空前缀:本地机扫描文件夹绝对路径】', '').replace('\\', '/'))
    for index, file in enumerate(file_list):
        print(('正在扫描第 ' + str(index + 1) + ' 个文件,共 ' + str(len(file_list)) + ' 个文件').center(50, '-'))
        try:
            dataset_iters, urls, js_contents = xml_scan(file)
            rs_list.append([file_name[index], dataset_iters, urls, js_contents])
        except Exception as e:
            # BUGFIX: was a bare `except:` which also swallowed SystemExit /
            # KeyboardInterrupt and hid the failure reason.
            print('【文件扫描失败】:', file, e)
    print('文件扫描完毕,正在写入Excel'.ljust(50, '-'))
    write_excel(rs_list, result_path)
if __name__ == '__main__':
    # Absolute path of the folder of FineReport files to scan.
    fr_path = r'【本地机扫描文件夹绝对路径】'
    # Absolute path of the scan result file.
    result_path = r'【本地机结果路径】\scaning_result.xlsx'
    main_scan(fr_path, result_path)
    # BUGFIX: the two stray notes below were bare (non-comment) lines and made
    # the whole file a SyntaxError; they are kept here as comments.
    # 扫描文件夹:
    # 运行结果