[技术分享]Python3 网页爬取实战应用
stclair2201发布于1 年前 • 620 次阅读
这几天在做公司的生态在线加油服务。由于从外部资源中无法找到各个加油站准确的油价信息,转而想办法去获取各个区域的油价——在绝大多数场景下,区域油价与真实油价还是比较接近的。所以最后确定去油价平台获取最新的各区域油价数据。使用 Python 开发,代码如下:
import urllib.request
import re
import json
import time
import os
# Output root for this run: one sub-directory per calendar day (local time).
# time.strftime() with no time tuple formats the current local time.
dataDir = "".join(("./data/", time.strftime('%Y-%m-%d'), "/"))
# The completion marker written by the crawl lives under <dataDir>log/.
logDir = "".join((dataDir, "log/"))
def getOtherOilType():
    """Scrape the 93 and 97 price tables and save them as one JSON file.

    For each grade in ("93", "97") the page ``<grade>haoqiyoujiage.asp`` is
    fetched through ``getUrlContent``. Spaces and CRLF sequences are removed
    from the markup before matching, and one record is appended per matched
    table row with the keys ``url``, ``province`` and ``_<grade>``.

    The combined list is serialised with ``json.dumps`` and written to
    ``<dataDir>____9397.txt``. Returns ``None``.
    """
    otherOilType = ["93", "97"]
    # The pattern targets the whitespace-stripped markup, which is why it
    # reads "<tdbgcolor" / "<ahref". It is identical for both grades, so it
    # is compiled once instead of on every loop iteration. Five groups are
    # captured; only the first three are used below.
    reg = re.compile(
        r'<tdbgcolor="#FFFFFF"><ahref="/(.*?)/"target="_blank">(.*?)</a></td><tdbgcolor="#FFFFFF">(.*?)</td><tdbgcolor="#FFFFFF">(.*?)</td><tdbgcolor="#FFFFFF">(.*?)</td>'
    )
    result = []
    for oil in otherOilType:
        htmlCode = getUrlContent('http://youjia.chemcp.com/' + oil + "haoqiyoujiage.asp")
        htmlCode = htmlCode.replace(' ', '').replace("\r\n", '')
        for price in reg.findall(htmlCode):
            result.append({
                'url': price[0],
                'province': price[1],
                '_' + oil: price[2],
            })
    text = json.dumps(result)
    # Context manager closes the file even when the write fails.
    with open(dataDir + "____9397.txt", 'w') as f:
        f.write(text)
def getProvinceList():
    """Scrape every province linked from the index page into its own file.

    The index page is fetched through ``getUrlContent`` and each matching
    table link yields a ``(path, name)`` pair. For every pair the target file
    is ``<dataDir><path with "/" removed>.txt``. If that file already exists
    the province is skipped; otherwise ``getProvince`` is called and the
    result is written to the file as JSON. Returns ``None``.
    """
    htmlCode = getUrlContent('http://youjia.chemcp.com/')
    reg = re.compile(
        r'<td bgcolor="#FFFFFF"><a href="http://youjia.chemcp.com/(.*?)" target="_blank">(.*?)</a></td>'
    )
    provices = reg.findall(htmlCode)
    for pro in provices:
        slug = pro[0].replace("/", "")
        profile = dataDir + slug + ".txt"
        if os.path.exists(profile):
            # Already written earlier; do not fetch this province again.
            continue
        proPrice = getProvince(pro)
        proresult = {
            "url": slug,
            "name": pro[1],
            "_90": proPrice["_90"],
            "_92": proPrice["_92"],
            "_95": proPrice["_95"],
            "_0": proPrice["_0"],
            "data": proPrice["data"],
        }
        text = json.dumps(proresult)
        # Context manager closes the file even when the write fails.
        with open(profile, 'w') as f:
            f.write(text)
def getProvince(pro):
    """Scrape one province page plus every city page it links to.

    Args:
        pro: Sequence whose first item is the province's URL path on the
            site; it is appended to the site root and also used as the prefix
            of the city links. Callers in this file pass a regex match tuple
            ``(path, name)``.

    Returns:
        A dict with the keys ``_90``, ``_92``, ``_95`` and ``_0``, taken in
        that order from the first four red-font "元/升" prices on the page.
        It also has ``data``: a list with one dict per city, holding ``url``,
        ``name``, the same four price keys and that city's own ``data`` list
        as returned by ``getCity``.

    Raises:
        IndexError: If the page has fewer than four matching prices.
    """
    proUrl = pro[0]
    htmlCode = getUrlContent('http://youjia.chemcp.com/' + proUrl)
    provicePrice = re.findall(r'<font color="red">(.*?)元/升</font>', htmlCode)
    data = {
        "_90": provicePrice[0],
        "_92": provicePrice[1],
        "_95": provicePrice[2],
        "_0": provicePrice[3],
        "data": [],
    }
    # proUrl is scraped text; escape it so that any regex metacharacter in it
    # is matched literally instead of altering the pattern.
    reg = re.compile(
        r'<a href="/' + re.escape(proUrl) + r'(.*?).html" target="_blank">(.*?)今日油价</a>'
    )
    for city in reg.findall(htmlCode):
        # NOTE(review): this one city slug is excluded by the original code;
        # the reason is not visible here -- confirm before removing.
        if city[0] != "hamidiqu":
            cityPrice = getCity({"curl": city[0], "purl": proUrl})
            data["data"].append({
                "url": city[0],
                "name": city[1],
                "_90": cityPrice["_90"],
                "_92": cityPrice["_92"],
                "_95": cityPrice["_95"],
                "_0": cityPrice["_0"],
                "data": cityPrice["data"],
            })
    return data
def getCity(url):
    """Scrape a single city page and return its prices with per-area rows.

    Args:
        url: Mapping with ``purl`` (province path prefix) and ``curl`` (city
            slug); the page fetched is ``<site root><purl><curl>.html``.

    Returns:
        A dict with ``_90``, ``_92``, ``_95`` and ``_0`` taken from the first
        four red-font "元/升" prices of the page, and ``data``: one dict per
        matched table row. Each row dict uses the row's first cell for both
        ``url`` and ``name`` and the next four cells for the price keys.

    Raises:
        IndexError: If the page has fewer than four matching prices.
    """
    separator = "----------------------------------"
    print(separator)
    print("城市:"+url["curl"])
    print(separator)

    page = getUrlContent('http://youjia.chemcp.com/'+url['purl']+url['curl']+'.html')

    # Headline prices are matched against the untouched markup.
    headline = re.findall(r'<font color="red">(.*?)元/升</font>', page)

    # Table rows are matched after dropping spaces and CRLF, which is why the
    # pattern is written without whitespace ("<tdbgcolor").
    row_pattern = re.compile(
        r'<tdbgcolor="#FFFFFF">(.*?)今日油价</td><tdbgcolor="#FFFFFF">(.*?)</td><tdbgcolor="#FFFFFF">(.*?)</td><tdbgcolor="#FFFFFF">(.*?)</td><tdbgcolor="#FFFFFF">(.*?)</td>'
    )
    compact = page.replace(' ', '').replace("\r\n", '')
    rows = [
        {
            "url": cells[0],
            "name": cells[0],
            "_90": cells[1],
            "_92": cells[2],
            "_95": cells[3],
            "_0": cells[4],
        }
        for cells in row_pattern.findall(compact)
    ]

    return {
        "_90": headline[0],
        "_92": headline[1],
        "_95": headline[2],
        "_0": headline[3],
        "data": rows,
    }
def getUrlContent(url):
    """Download ``url`` and return its body decoded as GBK text.

    The URL is printed before the request is made. The request uses a
    500-second timeout.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The response body as ``str``.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    print(url)
    # Use the response as a context manager so the connection is released
    # even if read() raises.
    with urllib.request.urlopen(url, None, 500) as page:
        raw = page.read()
    return raw.decode("gbk")
def clawer():
    """Run the whole scrape unless the completion marker already exists.

    If ``<logDir>OK.txt`` is missing, this calls ``getProvinceList`` and then
    ``getOtherOilType``, and finally writes the marker file containing "OK".
    If the marker is present nothing is done. Returns ``None``.
    """
    okfile = logDir + "OK.txt"
    if os.path.exists(okfile):
        return
    getProvinceList()
    print("获取 93,97 号省级油价")
    getOtherOilType()
    # The marker is written only after both scrape steps returned.
    # Context manager closes the file even when the write fails.
    with open(okfile, 'w') as f:
        f.write("OK")
if __name__ == '__main__':
    # Create the log directory (and its parents) before crawling.
    # The mode must be an octal literal: decimal 775 is 0o1407, which leaves
    # the owner without write or search permission on the new directory.
    try:
        os.makedirs(logDir, 0o775)
    except FileExistsError:
        # Only "already exists" is expected here; other OS errors (for
        # example permission denied) now propagate instead of being
        # misreported as an existing directory.
        print("日志目录已存在!")
    clawer()
上效果图: