#!/usr/bin/env python3
# _*_ coding:utf-8 _*_
# NOTE(review): this source is a concatenation of six project modules
# (crawlsite.py, dataanalyze.py, datastore.py, worldcloudgene.py,
# config.py, log.py). The original module boundaries are marked below.
# Each section still imports its siblings as separate modules
# (`from config import Config`, etc.) exactly as the original files did.

# ===================== crawlsite.py =====================
"""
@File    : crawlsite.py
@Project : coursedesign
@Time    : 2021/12/13 19:57
@Author  : TheNorth
@Function:
"""
from config import Config
import requests
from tools.log import logger
from lxml import etree
from service.datastore import DataStore


class CrawlSite(object):
    """Core crawler for lagou.com job listings."""

    def __init__(self):
        """Initialise crawler state from the shared Config."""
        self.headers = Config.REQUEST_HEADERS
        self.baseurl = Config.BASE_URL
        self.logger = logger
        self.data_store = DataStore()
        # City of the most recent request; stamped onto every parsed row.
        self.city = ""
        # Accumulated technology keywords for word-cloud generation.
        self.world_cloud_content = ""

    def crawlTarget(self, query_content: str, page_num: int, city: str):
        """Fetch one search-result page and persist the parsed rows.

        :param query_content: search keyword (also used as the table name)
        :param page_num: result page number to fetch
        :param city: city filter for the search
        :return: None
        """
        target_url = (f"{self.baseurl}?px=new&pn={page_num}"
                      f"&fromSearch=true&kd={query_content}&city={city}")
        try:
            self.logger.info("开始请求")
            resp = requests.get(url=target_url, headers=self.headers, timeout=3)
            self.city = city
            if resp.status_code != 200:
                self.logger.warning(f"目标状态码错误,状态码为{resp.status_code}")
            else:
                self.logger.info("目标地址请求成功")
                self.logger.info("开始解析")
                result_list = self.parse_html(resp.text)
                self.logger.info("数据持久化")
                self.data_store.createTable(table_name=query_content)
                self.data_store.insertdata(result_list)
        except Exception as error:
            # Broad catch kept deliberately: a single failed page must not
            # abort a multi-page crawl; the error is logged instead.
            self.logger.error(f"爬虫爬取目标站点报错{error}")

    def parse_html(self, html_content) -> list:
        """Parse one result page with XPath.

        :param html_content: raw HTML of a search-result page
        :return: list of dicts with company / salary / degree / city keys
                 (empty list when the page has no listings or parsing fails)
        """
        try:
            html = etree.HTML(html_content)
            # NOTE(review): the hashed class names below are tied to a
            # specific lagou.com frontend build and will break when the
            # site redeploys — confirm against the live markup.
            item_xpath = ('//div[@class="content-left__31-g5"]'
                          '//div[@class="list__YibNq"]'
                          '//div[@class="item__10RTO"]')
            company_list = html.xpath(
                item_xpath +
                '//div[@class="itemtop__1Z3Zo"]//div[@class="company__2EsC8"]'
                '//div[@class="company-name__2-SjF"]//a/text()'
            )
            if not company_list:
                # An empty page signals the end of pagination.
                self.logger.warning("全部页数解析完毕")
                return []
            salary_list = html.xpath(
                item_xpath + '//div[2]//span[@class="money__3Lkgq"]/text()'
            )
            degree_raw_list = html.xpath(
                item_xpath + '//div[@class="p-bom__JlNur"]/text()'
            )
            technology_list = html.xpath(
                item_xpath +
                '//div[@class="itembom__cTJhu"]//div[@class="ir___QwEG"]'
                '//span/text()'
            )
            for technology in technology_list:
                self.world_cloud_content += technology + " "
            # Raw degree text looks like "3-5年/本科"; keep the last segment.
            degree_list = [raw.split("/")[-1] for raw in degree_raw_list]
            result_list = []
            for index in range(len(company_list)):
                # Index-based access kept: if the lists ever diverge in
                # length the IndexError is caught below and [] is returned,
                # matching the original behaviour.
                result_list.append({
                    "company": company_list[index].strip(' '),
                    "salary": salary_list[index].strip(' '),
                    "degree": degree_list[index].strip(' '),
                    "city": self.city.strip(' '),
                })
            self.logger.info(company_list)
            self.logger.info(salary_list)
            self.logger.info(degree_list)
            self.logger.info(result_list)
            return result_list
        except Exception as error:
            self.logger.error(f"解析html文件报错,错误信息{error}")
            return []


if __name__ == '__main__':
    spider = CrawlSite()
    spider.crawlTarget(query_content="java开发", page_num=2, city="北京")

# ===================== dataanalyze.py =====================
"""
@File    : dataanalyze.py
@Project : coursedesign
@Time    : 2021/12/15 0:05
@Author  : TheNorth
@Function:
"""
import matplotlib.pyplot as plt
from service.datastore import DataStore
from tools.log import logger


class DataAnalyze(object):
    """Chart-based analysis of the crawled job data."""

    def __init__(self):
        pass

    def analyzeSalary(self):
        """Plot the average monthly salary per city for the 'python' table.

        :return: None (shows a matplotlib line chart)
        """
        datadao = DataStore()
        city_list = ["北京", "上海", "深圳", "合肥", "武汉", "南京"]
        aver_salary_list = []
        for city in city_list:
            salary_list = datadao.searchdata(query_column='salary',
                                             city_name=city,
                                             table_name="python")
            tmp_salary = 0
            if len(salary_list) == 0:
                # BUG FIX: the original only logged the empty city and then
                # divided by len(salary_list) == 0 (ZeroDivisionError).
                # Record 0 for that city and keep x/y lengths aligned.
                logger.info(city)
                aver_salary_list.append(0)
                continue
            for salary in salary_list:
                # Salary strings look like "10k-20k"; average the two ends.
                salary_str = salary[0].replace('k', '')
                tmp_salary_list = salary_str.split('-')
                tmp_salary += int(int(tmp_salary_list[0]) +
                                  int(tmp_salary_list[1])) // 2
            aver_salary_list.append(tmp_salary // len(salary_list))
        x = ["北京", "上海", "深圳", "合肥", "武汉", "南京"]
        y = aver_salary_list
        # Without a CJK-capable font the title/labels render as boxes.
        plt.rcParams['font.sans-serif'] = ['FangSong']
        plt.title('一、二线城市python开发薪资水平', fontsize=20, color='black')
        plt.xlabel('城市名称', fontsize=20, color='black')
        plt.ylabel('薪资水平', fontsize=20, color='black')
        plt.plot(x, y)
        plt.show()

    def analyzeDegree(self):
        """Plot the degree-requirement distribution as a pie chart.

        :return: None (shows a matplotlib pie chart)
        """
        datadao = DataStore()
        try:
            city_list = ["北京", "上海", "深圳", "合肥", "武汉", "南京"]
            labels = '大专', '本科', '硕士'
            label_data = [0, 0, 0]
            for city in city_list:
                degree_city_list = datadao.searchdata(query_column='degree',
                                                      table_name='python',
                                                      city_name=city)
                for degree in degree_city_list:
                    if degree[0] == "大专":
                        label_data[0] += 1
                    elif degree[0] == "本科":
                        label_data[1] += 1
                    elif degree[0] == "硕士":
                        label_data[2] += 1
            print(label_data)
            percent_list = []
            sum_label = label_data[0] + label_data[1] + label_data[2]
            for label_num in label_data:
                percent_list.append((label_num / sum_label) * 100)
            plt.rcParams['font.sans-serif'] = ['FangSong']
            explode = [0.1, 0.1, 0.1]
            plt.axes(aspect=1)
            plt.pie(x=percent_list, labels=labels, autopct='%.0f%%',
                    explode=explode, shadow=True)
            plt.show()
        except Exception as error:
            logger.error(f"查询学位字段出现错误{error}")


if __name__ == '__main__':
    dataanlyze = DataAnalyze()
    dataanlyze.analyzeDegree()

# ===================== datastore.py =====================
"""
@File    : datastore.py
@Project : coursedesign
@Time    : 2021/12/16 22:57
@Author  : TheNorth
@Function:
"""
import sqlite3
from tools.log import logger


class DataStore(object):
    """Persistence layer backed by a local SQLite database."""

    def __init__(self):
        """Open the SQLite database and create a cursor."""
        try:
            self.logger = logger
            # Raw string so the Windows backslashes are never treated as
            # escape sequences (same path value as before).
            self.con = sqlite3.connect(
                r"E:\pycharmproject\coursedesign\jobresult.db")
            self.cur = self.con.cursor()
        except Exception as error:
            self.logger.error(f"连接数据库出现错误,错误信息为{error}")

    def createTable(self, table_name: str):
        """Create the result table (idempotent) and remember its name.

        :param table_name: table to create; becomes the target of
                           subsequent insertdata() calls
        :return: None
        """
        try:
            # NOTE(review): table names cannot be bound as SQL parameters;
            # table_name comes from in-program search keywords, not user
            # input, but should still be validated upstream.
            sql = (f"CREATE TABLE IF NOT EXISTS {table_name}"
                   f"(id INTEGER PRIMARY KEY AUTOINCREMENT,"
                   f"company VARCHAR(255),"
                   f"salary VARCHAR(255), degree VARCHAR(255), "
                   f"city VARCHAR(255))")
            self.cur.execute(sql)
            self.table_name = table_name
            self.logger.info("创建数据表成功")
        except Exception as error:
            # self.table_name stays unset on failure, so a later
            # insertdata() will fail loudly instead of writing elsewhere.
            self.logger.warning("创建表出现错误")

    def insertdata(self, result_list: list):
        """Insert parsed job rows into the table set by createTable().

        :param result_list: list of dicts with company/salary/degree/city
        :return: None
        """
        for result_dict in result_list:
            company = result_dict.get("company", " ")
            salary = result_dict.get("salary", " ")
            degree = result_dict.get("degree", " ")
            city = result_dict.get("city", " ")
            try:
                # id column is AUTOINCREMENT; pass None for it.
                self.cur.execute(
                    f"INSERT INTO {self.table_name} values(?,?,?,?,?)",
                    (None, company, salary, degree, city))
            except Exception as error:
                self.logger.warning(f"插入数据报错,报错信息{error}")
            self.con.commit()

    def searchdata(self, query_column: str, table_name: str,
                   city_name: str = None) -> list:
        """Query one column of a table, filtered by city.

        :param query_column: column to select
        :param table_name: table to query
        :param city_name: city filter value
        :return: list of result tuples ([] on error)
        """
        try:
            # SECURITY FIX: the city value is now bound as a parameter
            # instead of being interpolated into the SQL string.
            # Column/table identifiers cannot be parameterized in SQLite
            # and are kept as f-string fields — they are program-internal.
            self.cur.execute(
                f"select {query_column} from {table_name} where city=?",
                (city_name,))
            res_list = self.cur.fetchall()
            return res_list
        except Exception as error:
            self.logger.error(f"查询数据库出现错误,错误信息为{error}")
            return []


if __name__ == '__main__':
    test = DataStore()
    test.createTable("java")
    test.searchdata(query_column='salary', city_name="北京", table_name="java")

# ===================== worldcloudgene.py =====================
"""
@File    : worldcloudgene.py
@Project : coursedesign
@Time    : 2021/12/23 23:35
@Author  : TheNorth
@Function:
"""
import wordcloud


class WorldCloudGene(object):
    """Word-cloud generation from accumulated technology keywords."""

    def generateWorldCloud(self, world_cloud_content: str, png_name: str):
        """Render *world_cloud_content* into '<png_name>.png'.

        :param world_cloud_content: space-separated words to render
        :param png_name: output file name without the .png suffix
        :return: None
        """
        # msyh.ttc (Microsoft YaHei) is needed so CJK words render.
        w = wordcloud.WordCloud(width=1000, height=700,
                                background_color='white',
                                font_path='msyh.ttc')
        w.generate(world_cloud_content)
        # Saved into the current working directory.
        w.to_file(f'{png_name}.png')

# ===================== config.py =====================
"""
@File    : config.py
@Project : coursedesign
@Time    : 2021/12/30 19:51
@Author  : TheNorth
@Function:
"""


class Config(object):
    """Shared crawler configuration."""

    BASE_URL = "https://www.lagou.com/wn/jobs"
    REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) '
                      'Gecko/20100101 Firefox/34.0',
        'Accept': '*/*',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        "Content-Type": "application/json"
    }

# ===================== log.py =====================
"""
@File    : log.py
@Project : coursedesign
@Time    : 2021/12/30 20:17
@Author  : TheNorth
@Function:
"""
import sys
import pathlib
from loguru import logger

# Path setup
relative_directory = pathlib.Path(__file__).parent.parent  # project root
result_save_dir = relative_directory.joinpath('results')   # results dir
log_path = result_save_dir.joinpath('crawlsite.log')       # crawl log path

# Logging configuration
# Terminal output format
stdout_fmt = ('<cyan>{time:HH:mm:ss,SSS}</cyan> '
              '[<level>{level: <5}</level>] '
              '<blue>{module}</blue>:<cyan>{line}</cyan> - '
              '<level>{message}</level>')
# Log-file record format
logfile_fmt = ('<light-green>{time:YYYY-MM-DD HH:mm:ss,SSS}</light-green> '
               '[<level>{level: <5}</level>] '
               '<cyan>{process.name}({process.id})</cyan>:'
               '<cyan>{thread.name: <18}({thread.id: <5})</cyan> | '
               '<blue>{module}</blue>.<blue>{function}</blue>:'
               '<blue>{line}</blue> - <level>{message}</level>')

logger.remove()
logger.level(name='TRACE', color='<cyan><bold>', icon='✏')
logger.level(name='DEBUG', color='<blue><bold>', icon='🐞 ')
logger.level(name='INFOR', no=20, color='<green><bold>', icon='ℹ')
logger.level(name='QUITE', no=25, color='<green><bold>', icon='🤫 ')
logger.level(name='ALERT', no=30, color='<yellow><bold>', icon='⚠')
logger.level(name='ERROR', color='<red><bold>', icon='❌')
logger.level(name='FATAL', no=50, color='<RED><bold>', icon='☠')
# Set level='QUITE' below to run silently in the terminal.
# Terminal log level defaults to INFOR.
logger.add(sys.stderr, level='INFOR', format=stdout_fmt, enqueue=True)
# File log level defaults to DEBUG.
logger.add(log_path, level='DEBUG', format=logfile_fmt, enqueue=True,
           encoding='utf-8')