实例介绍
【实例简介】
爬取天气网数据,显示登录界面和爬虫界面,并对爬取到的数据进行统计
【实例截图】
【核心代码】
# -*- coding:utf-8 -*-
"""Weather scraper demo: login window, threaded crawler, MySQL export, charts.

Flow: the ``Login`` window checks the credentials, ``TianQiSpider`` crawls the
regional forecast pages into ``tian.json``, the records are copied into a MySQL
table, an interactive window lets the user scrape one URL on demand
(``crawl``), and finally three matplotlib figures summarise the data.
"""
import requests
import mysql.connector
from bs4 import BeautifulSoup
import queue
import json
import matplotlib.pyplot as plt
import numpy as np
import time
from threading import Thread
from urllib.parse import urljoin
from pyquery import PyQuery as pq
import csv
from tkinter import *
from tkinter import messagebox
import tkinter as tk

start = ''

USER_AGENT = (
    'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/70.0.3538.102 Mobile Safari/537.36'
)

# NOTE(review): the original assignment of ``start_url`` was lost when this
# listing was extracted from the web page.  The CSS classes used below
# ('lqcontentBoxheader', 'conMidtab') suggest the text-forecast pages of
# weather.com.cn -- confirm this URL before relying on it.
DEFAULT_START_URL = 'http://www.weather.com.cn/textFC/hb.shtml'

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10

# First-cell texts that identify the two header rows of every province table.
PROVINCE_HEADER = '省/直辖市'
COLUMN_HEADER = '天气现象'

# Marker passed through the queues to tell the next pipeline stage to stop.
_PIPELINE_DONE = object()


def _forecast_container(html):
    """Return the first ``div.conMidtab`` element of *html*, or None."""
    soup = BeautifulSoup(html, 'html5lib')
    return soup.find(name='div', class_='conMidtab')


def _parse_row(tr):
    """Turn one ``<tr>`` into a weather record dict.

    Returns None for rows with fewer than five cells, because the fields are
    addressed relative to the end of the row (``[-5]`` .. ``[-2]``).
    """
    cells = tr.find_all(name='td')
    if len(cells) < 5:
        return None
    return dict(
        city=cells[0].get_text().strip(),
        tian=cells[-4].get_text(),
        wind=' '.join(cells[-3].get_text().split()),
        max=cells[-5].get_text(),
        # get_text() instead of .string: .string is None for nested markup.
        min=cells[-2].get_text(),
    )


def _parse_records(html):
    """Return every data row of the forecast tables in *html* as a dict."""
    container = _forecast_container(html)
    if container is None:
        return []
    records = []
    for tr in container.find_all(name='tr'):
        record = _parse_row(tr)
        # Header rows repeat for every province table, so filter by label
        # rather than by position.
        if record is None or record['city'] in (PROVINCE_HEADER, COLUMN_HEADER):
            continue
        records.append(record)
    return records


def _to_int(value):
    """Return ``int(value)``, or None when *value* is not a number."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


class TianQiSpider(object):
    """Three-stage threaded crawler: fetch pages -> parse rows -> save JSON."""

    def __init__(self, start_url=DEFAULT_START_URL):
        """Create the spider.

        :param start_url: page whose ``div.lqcontentBoxheader`` links list the
            regional forecast pages to crawl.
        """
        self.headers = {'User-Agent': USER_AGENT}
        self.start_url = start_url
        # Queues used to hand data from one stage's threads to the next.
        self.url_queue = queue.Queue()
        self.html_queue = queue.Queue()
        self.data_queue = queue.Queue()

    def get_url(self):
        """Fetch the start page and queue the URL of every region page."""
        print('数据采集中')
        html = requests.get(url=self.start_url, headers=self.headers,
                            timeout=REQUEST_TIMEOUT).content.decode()
        soup = BeautifulSoup(html, 'html5lib')
        div = soup.find(name='div', class_='lqcontentBoxheader')
        if div is None:
            return
        for a in div.find_all(name='a'):
            href = a.get('href')
            if href:
                # urljoin copes with both relative and absolute hrefs.
                self.url_queue.put(urljoin(self.start_url, href))

    def send_requests(self):
        """Download queued URLs until the URL queue is empty."""
        while True:
            # get_nowait() avoids the race of "qsize() > 0, then blocking
            # get()" when several fetch threads drain the same queue.
            try:
                url = self.url_queue.get_nowait()
            except queue.Empty:
                return
            try:
                response = requests.get(url=url, headers=self.headers,
                                        timeout=REQUEST_TIMEOUT)
                self.html_queue.put(response.content.decode())
            except requests.RequestException as exc:
                # One unreachable page should not kill the whole crawl.
                print('请求失败:', url, exc)
            finally:
                self.url_queue.task_done()

    def get_data(self):
        """Parse queued HTML pages into records until the stop marker arrives.

        This is a pipeline stage driven by :meth:`run`, which puts the stop
        marker on ``html_queue`` once every fetch thread has finished.
        """
        try:
            while True:
                html = self.html_queue.get()
                try:
                    if html is _PIPELINE_DONE:
                        break
                    for record in _parse_records(html):
                        self.data_queue.put(record)
                finally:
                    self.html_queue.task_done()
        finally:
            # Always release the saver, even if parsing failed unexpectedly.
            self.data_queue.put(_PIPELINE_DONE)

    def save_data(self):
        """Write queued records to ``tian.json`` (one JSON object per line).

        Runs until :meth:`get_data` forwards the stop marker, then records the
        number of saved rows in ``log.txt``.
        """
        data_num = 0
        with open('tian.json', 'w', encoding='utf8') as f:
            while True:
                data = self.data_queue.get()
                try:
                    if data is _PIPELINE_DONE:
                        break
                    f.write(json.dumps(data, ensure_ascii=False) + '\n')
                    data_num += 1
                finally:
                    self.data_queue.task_done()
        with open('log.txt', 'w', encoding='utf8') as f:
            f.write("爬取了%d条数据\n" % (data_num))
        print('数据采集完成')
        print("爬取了%d条数据" % (data_num))

    def run(self, fetch_threads=5):
        """Run the whole crawl and append timing information to ``log.txt``.

        :param fetch_threads: number of concurrent download threads.
        """
        start = time.time()
        self.get_url()
        fetchers = [Thread(target=self.send_requests)
                    for _ in range(fetch_threads)]
        parser = Thread(target=self.get_data)
        saver = Thread(target=self.save_data)
        thread_list = fetchers + [parser, saver]
        for t in thread_list:
            t.start()
        for t in fetchers:
            t.join()
        # Every page has been queued; let the parser (and, through it, the
        # saver) finish what is left and stop.
        self.html_queue.put(_PIPELINE_DONE)
        parser.join()
        saver.join()
        elapsed = time.time() - start
        with open('log.txt', 'a', encoding='utf8') as f:
            f.write("本次数据采集用时%d秒\n" % elapsed)
            f.write("采用了%d个线程数\n" % (len(thread_list)))
        print("本次数据采集用时%d秒" % elapsed)
        print("采用了%d个线程数" % (len(thread_list)))


def crawl():
    """Scrape the URL typed into ``url_input`` and list the rows in ``text``.

    Relies on the module-level widgets ``url_input`` and ``text`` created by
    the crawl window.  Every listed row is also written to ``天气.csv``.
    """
    url = url_input.get()
    headers = {'user-agent': USER_AGENT}
    print('在解析网址中:', url)
    try:
        html = requests.get(url=url, headers=headers,
                            timeout=REQUEST_TIMEOUT).content.decode()
    except requests.RequestException as exc:
        text.insert(END, '请求失败: %s' % exc)
        return
    container = _forecast_container(html)
    if container is None:
        text.insert(END, '未找到天气数据')
        return
    provinces = container.find_all(name='td', class_='rowsPan')
    province = ''
    province_num = 0
    # utf-8-sig so that spreadsheet programs detect the encoding.
    with open('天气.csv', 'w', newline='', encoding='utf-8-sig') as csv_file:
        writer = csv.writer(csv_file)
        for tr in container.find_all(name='tr'):
            record = _parse_row(tr)
            if record is None:
                continue
            if record['city'] == PROVINCE_HEADER:
                # A new province table starts: show its name as a heading.
                if province_num < len(provinces):
                    cell = provinces[province_num]
                    link = cell.find(name='a')
                    province = (link or cell).get_text().strip()
                    text.insert(END, province)
                province_num += 1
                continue
            if record['city'] == COLUMN_HEADER:
                continue
            text.insert(END, "城市:" + record['city']
                        + " 天气现象:" + record['tian']
                        + " 风力风向:" + record['wind']
                        + " 最高温度" + record['max']
                        + "℃ 最低温度" + record['min'] + "℃")
            writer.writerow([province, record['city'], record['tian'],
                             record['wind'], record['max'], record['min']])
            # Keep the newest row visible and repaint while scraping.
            text.see(END)
            text.update()
    print('已抓取完毕')


def _load_records(path='tian.json'):
    """Read the JSON-lines file written by the spider into a list of dicts."""
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def _store_records(records):
    """Copy *records* into the MySQL table ``weather`` (created if missing)."""
    # NOTE(review): connection settings are hard-coded (root, empty password).
    conn = mysql.connector.connect(user='root', password='',
                                   host='127.0.0.1', database='testPython')
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(
                'create table if not exists weather '
                '(id varchar(20) primary key, city varchar(40),'
                'tian varchar(20), wind varchar(40), '
                'max varchar(20), min varchar(20))')
            # Parameterized statement: the values come from a web page.
            # REPLACE keeps repeated runs from failing on the primary key.
            sql = ('replace into weather (id, city, tian, wind, max, min) '
                   'values (%s, %s, %s, %s, %s, %s)')
            for index, record in enumerate(records, start=1):
                cursor.execute(sql, (str(index), record['city'],
                                     record['tian'], record['wind'],
                                     record['max'], record['min']))
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()


def _summarise(records):
    """Count records per low-temperature range and per sky condition.

    Returns ``(temp_counts, sky_counts)``: *temp_counts* is a list for the
    ranges <=10, 10-20, 20-30 and >30 degrees; *sky_counts* is an array for
    cloudy/overcast, sunny and everything else.
    """
    temp_counts = [0, 0, 0, 0]
    sky_counts = np.zeros(3)
    for record in records:
        low = _to_int(record['min'])
        if low is not None:
            if low <= 10:
                temp_counts[0] += 1
            elif low <= 20:
                temp_counts[1] += 1
            elif low <= 30:
                temp_counts[2] += 1
            else:
                temp_counts[3] += 1
        if record['tian'] in ('多云', '阴'):
            sky_counts[0] += 1
        elif record['tian'] == '晴':
            sky_counts[1] += 1
        else:
            sky_counts[2] += 1
    return temp_counts, sky_counts


def _show_crawl_window():
    """Show the on-demand scraping window and block until it is closed."""
    global url_input, text
    root = Tk()
    root.title('测试——天气')
    # "<width>x<height>+<x offset>+<y offset>"
    root.geometry('550x400+398+279')
    Label(root, text='请输入测试的url:', font=("华文行楷", 20),
          fg='black').grid()
    url_input = Entry(root, font=("微软雅黑", 15))
    url_input.grid(row=0, column=1)
    # Empty label used as a spacer row.
    Label(root, text='', font=("微软雅黑", 10), fg='black').grid(row=1)
    text = Listbox(root, font=('微软雅黑', 15), width=45, height=10)
    text.grid(row=2, columnspan=2)
    Button(root, text='开始测试', font=("微软雅黑", 15),
           command=crawl).grid(row=3, column=0, sticky=W)
    Button(root, text='退出', font=("微软雅黑", 15),
           command=root.quit).grid(row=3, column=1, sticky=E)
    mainloop()
    try:
        # quit() only leaves the loop; remove the window as well.
        root.destroy()
    except tk.TclError:
        # The user already closed the window with the title-bar button.
        pass


def _draw_pie(title, labels, sizes, colors):
    """Open a new figure with a pie chart (left empty when there is no data)."""
    plt.figure()
    plt.title(title)
    if sum(sizes) > 0:
        plt.pie(sizes, explode=(0,) * len(sizes), labels=labels,
                colors=colors, autopct='%1.1f%%', shadow=True, startangle=50)
    plt.axis('equal')
    plt.grid(True)


def _plot_statistics(records, temp_counts, sky_counts):
    """Draw the temperature bar chart and the two pie charts, then show them."""
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
    plt.rcParams['axes.unicode_minus'] = False    # render the minus sign

    # The bar chart only shows the first ten cities.
    shown = records[0:10]
    city = [record['city'] for record in shown]
    # Values that are not numbers (e.g. '-') are plotted as 0.
    temp = [_to_int(record['min']) or 0 for record in shown]
    maxtemp = [_to_int(record['max']) or 0 for record in shown]

    plt.figure()
    plt.title('全国城市气温统计')
    a = np.array(temp)
    b = np.array(maxtemp)
    plt.bar(range(len(temp)), a, label='最低气温', tick_label=city)
    plt.bar(range(len(temp)), b, bottom=a, label='最高气温', tick_label=city)
    plt.grid(True)
    plt.legend()

    _draw_pie('最低气温范围统计',
              ('10℃以下', '10℃~20℃', '20℃~30℃', '30℃以上'),
              tuple(temp_counts),
              ('lightgreen', 'gold', 'lightskyblue', 'lightcoral'))
    _draw_pie('全国天气统计',
              ('阴', '晴', '有雨'),
              (sky_counts[0], sky_counts[1], sky_counts[2]),
              ('lightgreen', 'gold', 'lightskyblue'))
    plt.legend()
    plt.show()


class Login(object):
    """Login window that starts the crawl after a successful login."""

    def __init__(self):
        # Main window that hosts every other widget.
        self.root = tk.Tk()
        self.root.title("测试天气网")
        self.root.geometry('450x300')
        self.label_account = tk.Label(self.root, text='Account: ')
        self.label_password = tk.Label(self.root, text='Password: ')
        self.input_account = tk.Entry(self.root, width=30)
        # show='*' masks the typed password.
        self.input_password = tk.Entry(self.root, show='*', width=30)
        self.login_button = tk.Button(self.root,
                                      command=self.backstage_interface,
                                      text="Login", width=10)
        self.siginUp_button = tk.Button(self.root,
                                        command=self.siginUp_interface,
                                        text="Sign up", width=10)

    def gui_arrang(self):
        """Place the widgets inside the login window."""
        self.label_account.place(x=60, y=170)
        self.label_password.place(x=60, y=195)
        self.input_account.place(x=135, y=170)
        self.input_password.place(x=135, y=195)
        self.login_button.place(x=140, y=235)
        self.siginUp_button.place(x=240, y=235)

    def siginUp_interface(self):
        """Placeholder for the sign-up screen: only shows an info dialog."""
        messagebox.showinfo(title='', message='进入注册界面')

    def backstage_interface(self):
        """Check the credentials and, on success, run the whole workflow."""
        account = self.input_account.get().strip()
        password = self.input_password.get().strip()
        # NOTE(review): demo credentials are hard-coded.
        if account != 'admin' or password != '123456':
            print('登陆失败,用户名或密码不正确')
            return

        print("登陆成功")
        self.root.destroy()
        start = time.time()
        TianQiSpider().run()

        datas = _load_records()
        _store_records(datas)
        with open('log.txt', 'a', encoding='utf8') as f:
            f.write("总共用时%d秒" % (time.time() - start))
        print("总共用时%d秒" % (time.time() - start))

        temp_counts, sky_counts = _summarise(datas)
        _show_crawl_window()
        _plot_statistics(datas, temp_counts, sky_counts)


if __name__ == '__main__':
    login = Login()
    login.gui_arrang()
    tk.mainloop()
相关软件
网友评论
小贴士
感谢您为本站写下的评论,您的评论对其它用户来说具有重要的参考价值,所以请认真填写。
- 类似“顶”、“沙发”之类没有营养的文字,对勤劳贡献的楼主来说是令人沮丧的反馈信息。
- 相信您也不想看到一排文字/表情墙,所以请不要反馈意义不大的重复字符,也请尽量不要纯表情的回复。
- 提问之前请再仔细看一遍楼主的说明,或许是您遗漏了。
- 请勿到处挖坑绊人、招贴广告。既占空间让人厌烦,又没人会搭理,于人于己都无利。
关于好例子网
本站旨在为广大IT学习爱好者提供一个非营利性互相学习交流分享平台。本站所有资源都可以被免费获取学习研究。本站资源来自网友分享,对搜索内容的合法性不具有预见性、识别性、控制性,仅供学习研究,请务必在下载后24小时内给予删除,不得用于其他任何用途,否则后果自负。基于互联网的特殊性,平台无法对用户传输的作品、信息、内容的权属或合法性、安全性、合规性、真实性、科学性、完整性、有效性等进行实质审查;无论平台是否已进行审查,用户均应自行承担因其传输的作品、信息、内容而可能或已经产生的侵权或权属纠纷等法律责任。本站所有资源不代表本站的观点或立场,基于网友分享,根据中国法律《信息网络传播权保护条例》第二十二与二十三条之规定,若资源存在侵权或相关问题请联系本站客服人员,点此联系我们。关于更多版权及免责声明参见 版权及免责声明
支持(0) 盖楼(回复)
支持(0) 盖楼(回复)
支持(0) 盖楼(回复)