Commit 14494d3e by MSI-NB

Crawler

from bs4 import BeautifulSoup
import requests
import time
from beautifulsoup.parameters import parameters
class html_tools(object):
    def __init__(self):
        pass

    def get_driver_id(self, html_doc):
        ids = []
        bsp = BeautifulSoup(html_doc, 'html.parser', from_encoding="iso-8859-1")
        datas = bsp.find_all('a')
        datas = filter(lambda x: x is not None, [ele.get('href') for ele in datas])
        for e in datas:
            start = e.find("'")
            end = e.find("'", start + 1)
            id = e[start + 1:end]
            ids.append(id)
        return ids
    # Get the total number of result pages reported on the current page
    def total_page(self, html_doc):
        bsp = BeautifulSoup(html_doc, 'html.parser', from_encoding="iso-8859-1")

        def is_page(div):
            classes = div.get('class')
            if classes == ['rit', 'mar10']:
                return True
            else:
                return False

        divs = bsp.find_all('div')
        div = filter(is_page, divs)
        div_label = str(list(div)[0])
        start_index = div_label.find('录 共')
        if start_index == -1:
            return 1
        end_index = div_label.find('页 转')
        print('start_index:', start_index)
        print('end_index:', end_index)
        return int(div_label[start_index + 3:end_index])
    def get_data(self, html_doc):
        # datas = {}
        data_list = []
        bsp = BeautifulSoup(html_doc, 'html.parser', from_encoding="iso-8859-1")
        tab = bsp.table  # take the first table on the page
        # print(tab)
        rows = tab.find_all('tr')  # one <tr> per data row
        for row in rows:
            tds = row.find_all('td')
            tds = filter(lambda td: td.get('class') is None, tds)
            for td in tds:
                text = td.get_text()
                text = text.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', '').replace('\xa0', '')
                data_list.append(text)
        return data_list
    def retry(self, url, parameter):
        id_html_doc = requests.get(url, headers=parameter.create_id_header()).content
        try:
            return self.get_data(id_html_doc)
        except Exception:
            # time.sleep(1)
            return self.retry(url, parameter)
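

# A minimal sketch of a bounded alternative to html_tools.retry above: the
# recursive retry has no upper bound, so a page that keeps failing can exhaust
# Python's recursion limit. This helper is an assumption (not part of the
# original code) showing the same idea with a retry cap and a short back-off.
def retry_with_limit(tool, url, parameter, max_attempts=5):
    for _ in range(max_attempts):
        html = requests.get(url, headers=parameter.create_id_header()).content
        try:
            return tool.get_data(html)
        except Exception:
            time.sleep(1)  # back off briefly before the next attempt
    return []  # give up after max_attempts consecutive failures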


if __name__ == '__main__':
    def is_page(div):
        classes = div.get('class')
        if classes == ['rit', 'mar10']:
            return True
        else:
            return False

    parameter = parameters()
    html_doc = requests.post(parameter.driver_url, data=parameter.create_one_page_parameter(), headers=parameter.create_one_page_header()).content
    bsp = BeautifulSoup(html_doc, 'html.parser')
    divs = bsp.find_all('div')
    # classes = [ele.get('class') for ele in divs]
    # print(classes)
    div = filter(is_page, divs)
    # div = filter(lambda x: x == ['rit', 'mar10'], [ele.get('class') for ele in divs])
    div_label = str(list(div)[0])
    index = div_label.find('页 转')
    print(index)
    print(div_label[987:989])
    # a = [1, 2, 3, 4]
    # b = filter(lambda x: x > 2, a)
    # for i in b:
    #     print(i)

import cv2 as cv
import os
import numpy as np


class image_tools(object):
    def __init__(self):
        self.image_dir = r'D:\pic3'
        self.save_image_path = r'D:\pic4'

    def get_image_paths(self):
        dirs = os.listdir(self.image_dir)
        return [os.path.join(self.image_dir, dirs[i]) for i in range(len(dirs))]

    def valid(self, image, x, y):
        if x < 0 or x >= image.shape[0] or y < 0 or y >= image.shape[1]:
            return False
        else:
            return True
    # Decide whether a pixel is noise. Cleaned pixels are written into after_table_b;
    # level tunes how aggressively noise is removed.
    def clear_noise_pixel_binary(self, image, x, y, after_table_b, level):
        now = image[x, y]
        flag = 0
        for i in range(-1, 2):
            for j in range(-1, 2):
                if i == 0 and j == 0:
                    continue
                if self.valid(image, x + i, y + j):
                    if image[x + i, y + j] == 0:
                        flag += 1  # count the black neighbours of this pixel
        # print(flag)
        if now == 0 and flag < level:
            after_table_b[x, y] = 255  # removal: a black pixel with fewer than `level` black neighbours becomes white
        elif now == 255 and flag >= 4:
            after_table_b[x, y] = 0  # fill-in: a white pixel with 4 or more black neighbours becomes black
        else:
            after_table_b[x, y] = now
    # Remove the black interference lines by turning near-black pixels white
    def clear_black(self, image, x, y, new_image):
        now = image[x, y]
        # print(now)
        if now[0] <= 30 and now[1] <= 30 and now[2] <= 30:
            new_image[x, y] = (255, 255, 255)
        else:
            new_image[x, y] = now
    '''Split one CAPTCHA into 4 single-character images'''
    def image_splite_one_image(self, image_path):
        img = cv.imread(image_path)
        h, w, ch = img.shape
        '''remove the black interference lines'''
        no_black_line_img = np.zeros(shape=(h, w, 3), dtype=img.dtype)
        for row in range(h):
            for col in range(w):
                self.clear_black(img, row, col, no_black_line_img)
        '''convert to grayscale'''
        gray_image = cv.cvtColor(no_black_line_img, cv.COLOR_BGR2GRAY)
        '''binarize using the mean gray value as the threshold'''
        T = cv.mean(gray_image)[0]
        binary_image = np.zeros(shape=gray_image.shape, dtype=gray_image.dtype)
        for row in range(h):
            for col in range(w):
                px = gray_image[row, col]
                if px > T:
                    binary_image[row, col] = 255
                else:
                    binary_image[row, col] = 0
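        # Aside (not in the original code): the manual loop above should be
        # equivalent to OpenCV's built-in thresholding, e.g.
        #     _, binary_image = cv.threshold(gray_image, T, 255, cv.THRESH_BINARY)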
        '''remove salt-and-pepper noise; a median filter is not recommended here'''
        pic = np.zeros(binary_image.shape, dtype=binary_image.dtype)
        for row in range(h):
            for col in range(w):
                self.clear_noise_pixel_binary(binary_image, row, col, pic, 3)
        return pic[0:20, 12:28], pic[0:20, 28:44], pic[0:20, 44:60], pic[0:20, 60:76]
    def image_spilt_all(self):
        image_dirs = self.get_image_paths()
        for name, image_path in enumerate(image_dirs):
            pic1, pic2, pic3, pic4 = self.image_splite_one_image(image_path)
            cv.imwrite(self.save_image_path + '\\' + str(name) + '_' + '1' + '.jpg', pic1)
            cv.imwrite(self.save_image_path + '\\' + str(name) + '_' + '2' + '.jpg', pic2)
            cv.imwrite(self.save_image_path + '\\' + str(name) + '_' + '3' + '.jpg', pic3)
            cv.imwrite(self.save_image_path + '\\' + str(name) + '_' + '4' + '.jpg', pic4)
        cv.waitKey(0)
        cv.destroyAllWindows()
    def morphology(self, path):
        src = cv.imread(path)
        k = np.zeros((1, 1), np.uint8)
        # res = cv.morphologyEx(src, cv.MORPH_CLOSE, k, iterations=3)
        res = cv.erode(src, k)
        cv.imshow('src', src)
        cv.imshow('res', res)
        cv.waitKey(0)
        cv.destroyAllWindows()


if __name__ == '__main__':
    # tool = image_tools()
    # tool.image_spilt_all()
    # tool.morphology(r'D:\pic4\6_3.jpg')
    src = cv.imread(r'D:\pic4\6_3.jpg')
    # print(src)

杭州长运西北汽车客运有限公司 79667012-1 88180060
import numpy as np
import pandas as pd

path = r'C:\Users\MSI-NB\Desktop\门架车流.xlsx'
sheet = pd.read_excel(path, '16-19', header=None).values
name1 = set(sheet[:, 0])
name2 = set(sheet[:, 3])
# keep only the names common to both columns
names = name1.intersection(name2)
print(names)
data1 = dict(zip(list(sheet[:, 0]), list(sheet[:, 1])))
data2 = dict(zip(list(sheet[:, 3]), list(sheet[:, 4])))
data1_total = []
data2_total = []
for name in names:
    if data1.get(name) is not None and data2.get(name) is not None:
        data1_total.append(int(data1.get(name)))
        data2_total.append(int(data2.get(name)))
        res = name + '\t' + str(int(data1.get(name))) + '\t' + name + '\t' + str(int(data2.get(name)))
        print(res)

print(sum(data1_total))
print(sum(data2_total))
"""
*@author: luliang
*@time: 2020-11-18 09:23:12
*@desc: 杭州交警平台驾驶人和违法数据爬取,每日4点爬取新增数据
"""
import requests
import datetime
import time
import json
from hdfs.client import Client
import smtplib
from email.mime.text import MIMEText
from beautifulsoup.parameters import parameters
from beautifulsoup.html_analyze import html_tools
from dateutil import relativedelta
from dateutil import rrule


# Crawl the driver information pages
def get_drivers_info(company_name, parameter):
    if parameter.session is None:
        return
    print('【.........start crawling driver information.........】')
    pageNum = 1        # page currently being crawled
    driver_result = []
    total_pages = 0    # total number of pages to crawl
    while True:
        print('【crawling page {}】'.format(str(pageNum)))
        if pageNum == 1:  # the first page also tells us the total page count
            html_doc = requests.post(parameter.driver_url, data=parameter.create_one_page_parameter(), headers=parameter.create_one_page_header()).content
            total_pages = html_tool.total_page(html_doc)
        else:
            html_doc = requests.post(parameter.driver_url, data=parameter.create_another_page_parameter(pageNum), headers=parameter.create_another_page_header()).content
        # collect every driver ID on this page
        # print('【pages】 = ', pages)
        ids = html_tool.get_driver_id(html_doc)
        # resolve each id and collect the detail rows
        id_urls = [parameter.create_id_url(id) for id in ids]
        # print(id_urls)
        for url in id_urls:
            # time.sleep(1)
            res = html_tool.retry(url, parameter)
            driver_result.append(res)
            # print(res)
        if pageNum == total_pages:
            break
        pageNum += 1
    return write_to_hdfs_partiton(driver_result, 'driver_basic_info', company_name)
    # df = pd.DataFrame(driver_result, columns=parameter.driver_column)
    # df.to_excel(parameter.driver_excel, index=None)


# Crawl the drivers' violation records
def get_driver_vio(company_name, parameter):
    # build the list of months to query, from 2016-01 up to the current month
    print('【.........start crawling driver violation data.........】')
    now = datetime.datetime.now().date()
    start = datetime.datetime(2016, month=1, day=1).date()
    end = datetime.datetime(now.year, now.month, 1).date()
    month_count = rrule.rrule(rrule.MONTHLY, dtstart=start, until=end).count()
    days = [(start + relativedelta.relativedelta(months=i)).strftime('%Y-%m-%d') for i in range(month_count)]
    vio_results = []
    # days = ['2019-09-01']
    for day in days:
        print('【crawling violation data for {}】'.format(day))
        response = requests.post(parameter.create_vio_url(), data=parameter.create_vio_parameter(day, company_name), headers=parameter.create_vio_header(day)).content
        response = str(response)
        start = response.find('[')
        end = response.find(']') + 1
        # print(response[start:end])
        json_str = response[start:end].replace("dwmc", "\"dwmc\"").replace("clxx", "\"clxx\"").replace("jszh", "\"jszh\"").replace("wfsj", "\"wfsj\"").replace("xm", "\"xm\"").replace("wfnr", "\"wfnr\"")
        if json_str != "":
            datas = json.loads(json_str)
            for data_dict in datas:
                # print(data_dict)
                vio_results.append([v.encode('utf-8').decode('unicode_escape') for v in list(data_dict.values())])
    # print('--------------vio_results:', str(len(vio_results)))
    return write_to_hdfs_partiton(vio_results, 'lawbreaking_data', company_name)
    # print(vio_results)


def write_to_hdfs_partiton(datas, table, company_name):
    try:
        client = Client("http://10.22.17.21:50070", root='/')
    except Exception:
        client = Client("http://10.22.17.20:50070", root='/')
    insert_time = time.strftime("%Y-%m-%d", time.localtime())
    [data.insert(0, company_name) for data in datas]
    add_data = list(filter(lambda x: ''.join(x).strip() != '', datas))
    [ad.append(insert_time) for ad in add_data]
    add_data_to_str = ['\t'.join(data) for data in add_data]
    # print(add_data_to_str[0:5])
    result = '\n'.join(add_data_to_str)
    print('【start writing data to HDFS】')
    client.write(hdfs_path='/apps/hive/warehouse/creditscores.db/{}/{}/{}'.format(table, insert_time, company_name),
                 data=result, append=False, encoding='utf-8')
    return len(datas)


def send_email(content, mess):
    current_day = str(datetime.datetime.now().date())
    subject = current_day + ':' + content + '(*^-^*)'  # email subject
    sender = "15330344779@163.com"    # sender address
    receiver = "15330344779@163.com"  # recipient address
    # SMTP authorization code, not the mailbox password
    password = "ISRLMHOENTBBWGSO"
    message = MIMEText(mess, "plain", "utf-8")
    # mess is the body, sent as "plain" text in utf-8
    message['Subject'] = subject   # email subject
    message['To'] = receiver       # recipient
    message['From'] = sender       # sender
    smtp = smtplib.SMTP_SSL("smtp.163.com", 994)  # connect to the SMTP server over SSL
    smtp.login(sender, password)
    smtp.sendmail(sender, [receiver], message.as_string())
    smtp.close()


if __name__ == '__main__':
    html_tool = html_tools()
    # read the company configuration file: one "name username password" line per company
    file = open('/root/javi/config/company.conf', 'r', encoding='utf-8')
    dataMat = []
    for line in file.readlines():
        dataMat.append(line.strip().split(' '))
    drivers_info = {}
    vio_info = {}
    for company_info in dataMat:
        company_name = company_info[0]
        print('【crawling company: {}】'.format(company_name))
        username = company_info[1]
        password = company_info[2]
        parameter = parameters(username, password)
        drivers_add_count = get_drivers_info(company_name, parameter)
        drivers_info[company_name] = drivers_add_count
        vio_add_count = get_driver_vio(company_name, parameter)
        vio_info[company_name] = vio_add_count
    print('【data crawl finished (*^__^*)】')
    info_str = ''
    for k, v in drivers_info.items():
        info_str += '{} driver records added today: {}\n'.format(k, str(v))
    for k, v in vio_info.items():
        info_str += '{} violation records added today: {}\n'.format(k, str(v))
    send_email('crawl summary', info_str)
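
# The header docstring says this job runs daily at 04:00. A hypothetical crontab
# entry for that schedule might look like the line below; the script name and
# interpreter path are assumptions, not taken from this repository.
#   0 4 * * * /usr/bin/python3 /root/javi/crawl_drivers.py >> /root/javi/crawl.log 2>&1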


import tensorflow as tf
from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2


# Convert the Keras .h5 model file to a frozen .pb graph so it can be loaded from OpenCV
def convert_h5to_pb():
    model = tf.keras.models.load_model(r'C:\Users\MSI-NB\Desktop\model\yzm_model.h5', compile=False)
    model.summary()
    full_model = tf.function(lambda Input: model(Input))
    full_model = full_model.get_concrete_function(tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype))
    # Get frozen ConcreteFunction
    frozen_func = convert_variables_to_constants_v2(full_model)
    frozen_func.graph.as_graph_def()
    # layers = [op.name for op in frozen_func.graph.get_operations()]
    # print("-" * 50)
    # print("Frozen model layers: ")
    # for layer in layers:
    #     print(layer)
    #
    # print("-" * 50)
    # print("Frozen model inputs: ")
    # print(frozen_func.inputs)
    # print("Frozen model outputs: ")
    # print(frozen_func.outputs)
    # Save frozen graph from frozen ConcreteFunction to hard drive
    tf.io.write_graph(graph_or_graph_def=frozen_func.graph,
                      logdir=r'C:\Users\MSI-NB\Desktop\model',
                      name="yzm_model.pb",
                      as_text=False)
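

# A minimal sketch (an assumption, not part of the original file) of consuming the
# frozen yzm_model.pb from OpenCV's DNN module, which is the reason the conversion
# above exists. The 20x16 grayscale input size follows the training script.
def classify_tile_with_opencv(tile_path):
    import cv2 as cv
    import numpy as np
    net = cv.dnn.readNetFromTensorflow(r'C:\Users\MSI-NB\Desktop\model\yzm_model.pb')
    tile = cv.imread(tile_path, cv.IMREAD_GRAYSCALE)            # one 20x16 character tile
    blob = cv.dnn.blobFromImage(tile.astype(np.float32) / 255.0, size=(16, 20))
    net.setInput(blob)
    probs = net.forward()                                       # shape (1, 34): one score per class
    return int(np.argmax(probs))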


import tensorflow as tf
import numpy as np
from tensorflow import keras
import glob
import matplotlib.pyplot as plt

#%%
# directory of split CAPTCHA character images, one sub-folder per character
dirs = r'D:/pic2/*'
paths = glob.glob(dirs)
# print(paths)
# 34 classes mapping each CAPTCHA character to an index (note: '1' and 'l' are absent)
dict = {'0': 0, 'a': 1, '2': 2, '3': 3, '4': 4, '5': 5,
        '6': 6, '7': 7, '8': 8, '9': 9, 'b': 10, 'c': 11,
        'd': 12, 'e': 13, 'f': 14, 'g': 15, 'h': 16, 'i': 17,
        'j': 18, 'k': 19, 'm': 20, 'n': 21, 'o': 22, 'p': 23,
        'q': 24, 'r': 25, 's': 26, 't': 27, 'u': 28, 'v': 29,
        'w': 30, 'x': 31, 'y': 32, 'z': 33}
all_path = []
labels = []
for path_dir in paths:
    image_path = glob.glob(path_dir + "\\*")
    all_path.extend(image_path)
    labels.extend([path_dir[-1] for p in image_path])
all_label = [dict[label] for label in labels]
# print(all_path)
# print(all_label)
total = len(all_path)
#%%
# reverse mapping: class index -> character
dict2 = {value: key for key, value in dict.items()}
print(dict2)
#%%
def load_pic(path):
    image_binary = tf.io.read_file(path)
    image_tensor = tf.image.decode_jpeg(image_binary)
    # image_tensor = tf.image.random_flip_left_right(image_tensor)
    image_tensor = tf.cast(image_tensor, tf.float32)
    image_tensor = image_tensor / 255
    return image_tensor


# print(all_path[4])
# print(all_label[4])
# tensor = load_pic(all_path[4])
# print(tensor.shape)
# plt.imshow(tf.keras.preprocessing.image.array_to_img(tensor))
# plt.show()
#%%
np.random.seed(2020)
random_index = np.random.permutation(total)
image_dataset = np.array([load_pic(path) for path in all_path])
image_dataset = image_dataset[random_index]
label_dataset = tf.keras.utils.to_categorical(np.array(all_label))
label_dataset = label_dataset[random_index]
#%% split into training and test sets (80/20)
train_total = int(total * 0.8)
test_total = total - train_total
train_ds = image_dataset[0:train_total]
train_label = label_dataset[0:train_total]
test_ds = image_dataset[train_total:]
test_label = label_dataset[train_total:]
#%%
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv2D(128, (3, 3), input_shape=(20, 16, 1), activation='relu', padding='same'))
model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(tf.keras.layers.Conv2D(64, (2, 2), activation='relu', padding='same'))
model.add(tf.keras.layers.GlobalAveragePooling2D())
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(34, activation='softmax'))
# model.summary()
#%%
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['acc'])
model.fit(x=train_ds,
          y=train_label,
          epochs=100,
          validation_data=(test_ds, test_label))
model.save(r'C:\Users\MSI-NB\Desktop\model\yzm_model.h5')
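#%%
# A minimal sketch (an assumption, not part of the original script) of using the
# saved model to decode one split CAPTCHA tile back to its character via dict2.
def predict_char(image_path):
    saved = tf.keras.models.load_model(r'C:\Users\MSI-NB\Desktop\model\yzm_model.h5')
    tensor = load_pic(image_path)                     # reuse the training preprocessing
    probs = saved.predict(tf.expand_dims(tensor, 0))  # add the batch dimension: (1, 20, 16, 1)
    return dict2[int(np.argmax(probs))]               # map the class index back to its character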