拉勾怎么用?基于爬虫技术爬取拉勾网招聘信息,分析软件开发行业走向

新闻资讯2026-04-21 00:46:10
  1 # !/usr/bin/env python3
  2 # _*_ coding:utf-8 _*_
  3 """
  4 @File : crawlsite.py
  5 @Project : coursedesign
  6 @Time : 2021/12/13 19:57
  7 @Author : TheNorth
  8 @Function :
  9 """
 10 from config import Config
 11 import requests
 12 from tools.log import logger
 13 from lxml import etree
 14 from service.datastore import DataStore
 15 class CrawlSite(object):
 16 """
 17 爬取拉钩网的核心代码
 18 """
 19 def __init__(self):
 20 """
 21 爬虫的相关参数的初始化
 22 :param
 23 """
 24 self.headers = Config.REQUEST_HEADERS
 25 self.baseurl = Config.BASE_URL
 26 self.logger = logger
 27 self.data_store = DataStore()
 28 self.city = ""
 29 self.world_cloud_content = ""
 30 def crawlTarget(self, query_content: str, page_num: int, city: str):
 31 """
 32 :param query_content: 要查询的内容
 33 :param page_num: 查询的页数
 34 :param city: 查询的城市
 35 :return:
 36 """
 37 target_url = f"{self.baseurl}?px=new&pn={page_num}&fromSearch=true&kd=
 38 {query_content}&city={city}"
 39 try:
 40 self.logger.info(f"开始请求")
 41 resp = requests.get(url=target_url, headers=self.headers, timeout=3)
 42 self.city = city
 43 if resp.status_code != 200:
 44 self.logger.warning(f"目标状态码错误,状态码为
 45 {resp.status_code}")
 46 else:
 47 self.logger.info(f"目标地址请求成功")
 48 self.logger.info(f"开始解析")
 49 result_list = self.parse_html(resp.text)
 50 self.logger.info(f"数据持久化")
 51 self.data_store.createTable(table_name=query_content)
 52 self.data_store.insertdata(result_list)
 53 except Exception as error:
 54 self.logger.error(f"爬虫爬取目标站点报错{error}")
 55 def parse_html(self, html_content) -> list:
 56 """
 57 使用xpath 解析html语法树
 58 :param html_content:
 59 :return 返回解析列表
 60 """
 61 try:
 62 html = etree.HTML(html_content)
 63 company_list = html.xpath(
 64 '//div[@class="content-left__31-
 65 g5"]//div[@class="list__YibNq"]//div[@class="item__10RTO"]'
 66 '//div[@class="itemtop__1Z3Zo"]//div[@class="company__2EsC8"]//div[@class="company-name__2-SjF"]'
 67 '//a/text()'
 68 )
 69 if not company_list:
 70 self.logger.warning("全部页数解析完毕")
 71 return []
 72 salary_list = html.xpath(
 73 '//div[@class="content-left__31-
 74 g5"]//div[@class="list__YibNq"]//div[@class="item__10RTO"]'
 75 '//div[2]//span[@class="money__3Lkgq"]/text()'
 76 )
 77 degree_raw_list = html.xpath(
 78 '//div[@class="content-left__31-
 79 g5"]//div[@class="list__YibNq"]//div[@class="item__10RTO"]'
 80 '//div[@class="p-bom__JlNur"]/text()'
 81 )
 82 technology_list = html.xpath(
 83 '//div[@class="content-left__31-
 84 g5"]//div[@class="list__YibNq"]//div[@class="item__10RTO"]'
 85 '//div[@class="itembom__cTJhu"]//div[@class="ir___QwEG"]//span/text()'
 86 )
 87 for technology in technology_list:
 88 self.world_cloud_content += technology + " "
 89 degree_list = []
 90 for degree_raw in degree_raw_list:
 91 degree = degree_raw.split("/")[-1]
 92 degree_list.append(degree)
 93 result_list = []
 94 for index in range(len(company_list)):
 95 result_dict = {}
 96 result_dict.update({"company": company_list[index].strip(' ')})
 97 result_dict.update({"salary": salary_list[index].strip(' ')})
 98 result_dict.update({"degree": degree_list[index].strip(' ')})
 99 result_dict.update({"city": self.city.strip(' ')})
100 result_list.append(result_dict)self.logger.info(company_list)
101 self.logger.info(salary_list)
102 self.logger.info(degree_list)
103 self.logger.info(result_list)
104 return result_list
105 except Exception as error:
106 self.logger.error(f"解析html文件报错,错误信息{error}")
107 return []
if __name__ == '__main__':
    # Manual smoke run: fetch page 2 of "java开发" listings for Beijing.
    crawler = CrawlSite()
    crawler.crawlTarget(query_content="java开发", page_num=2, city="北京")
# !/usr/bin/env python3
111 # _*_ coding:utf-8 _*_
112 """
113 @File : dataanalyze.py
114 @Project : coursedesign
115 @Time : 2021/12/15 0:05
116 @Author : TheNorth
117 @Function :
118 """
119 import matplotlib.pyplot as plt
120 from service.datastore import DataStore
121 from tools.log import logger
class DataAnalyze(object):
    """Chart-based analysis of the scraped job data."""

    def __init__(self):
        pass

    def analyzeSalary(self):
        """Plot the average "python" salary per city as a line chart.

        :return: None
        """
        datadao = DataStore()
        city_list = ["北京", "上海", "深圳", "合肥", "武汉", "南京"]
        aver_salary_list = []
        for city in city_list:
            salary_list = datadao.searchdata(query_column='salary',
                                             city_name=city,
                                             table_name="python")
            if not salary_list:
                # No rows for this city: log it and plot 0 instead of
                # dividing by len(salary_list) == 0 (the original only
                # logged and then crashed with ZeroDivisionError).
                logger.info(city)
                aver_salary_list.append(0)
                continue
            tmp_salary = 0
            for salary in salary_list:
                # Salary strings look like "10k-20k"; average the bounds.
                low, high = salary[0].replace('k', '').split('-')
                tmp_salary += (int(low) + int(high)) // 2
            aver_salary_list.append(tmp_salary // len(salary_list))
        x = city_list
        y = aver_salary_list
        # FangSong supports CJK glyphs; the default font renders Chinese
        # titles/labels as mojibake.
        plt.rcParams['font.sans-serif'] = ['FangSong']
        plt.title('一、二线城市python开发薪资水平', fontsize=20, color='black')
        plt.xlabel('城市名称', fontsize=20, color='black')
        plt.ylabel('薪资水平', fontsize=20, color='black')
        plt.plot(x, y)
        plt.show()

    def analyzeDegree(self):
        """Plot the distribution of required degrees as a pie chart.

        :return: None
        """
        datadao = DataStore()
        try:
            city_list = ["北京", "上海", "深圳", "合肥", "武汉", "南京"]
            labels = '大专', '本科', '硕士'
            label_data = [0, 0, 0]
            for city in city_list:
                degree_city_list = datadao.searchdata(query_column='degree',
                                                      table_name='python',
                                                      city_name=city)
                for degree in degree_city_list:
                    if degree[0] == "大专":
                        label_data[0] += 1
                    elif degree[0] == "本科":
                        label_data[1] += 1
                    elif degree[0] == "硕士":
                        label_data[2] += 1
            print(label_data)
            sum_label = sum(label_data)
            if sum_label == 0:
                # Nothing to plot; avoid a ZeroDivisionError below.
                logger.info("没有可用的学历数据")
                return
            percent_list = [(num / sum_label) * 100 for num in label_data]
            plt.rcParams['font.sans-serif'] = ['FangSong']
            explode = [0.1, 0.1, 0.1]
            plt.axes(aspect=1)  # equal aspect keeps the pie circular
            plt.pie(x=percent_list, labels=labels, autopct='%.0f%%',
                    explode=explode, shadow=True)
            plt.show()
        except Exception as error:
            logger.error(f"查询学位字段出现错误{error}")
if __name__ == '__main__':
    # Manual smoke run: render the degree-distribution pie chart.
    analyzer = DataAnalyze()
    analyzer.analyzeDegree()
192 # !/usr/bin/env python3
193 # _*_ coding:utf-8 _*_
194 """
195 @File : datastore.py
196 @Project : coursedesign
197 @Time : 2021/12/16 22:57
198 @Author : TheNorth
199 @Function :
200 """
201 import sqlite3
202 from tools.log import logger
class DataStore(object):
    """Persistence layer backed by a local SQLite database."""

    def __init__(self):
        """Open the SQLite connection and a shared cursor.

        A different path can be given to sqlite3.connect to relocate the
        database, e.g. sqlite3.connect(r"E:\DEMO.db").
        """
        try:
            self.logger = logger
            # Raw string: the original literal relied on invalid escape
            # sequences ("\p", "\c", "\j") that Python only tolerates
            # with a DeprecationWarning.
            self.con = sqlite3.connect(
                r"E:\pycharmproject\coursedesign\jobresult.db")
            self.cur = self.con.cursor()
        except Exception as error:
            self.logger.error(f"连接数据库出现错误,错误信息为{error}")

    @staticmethod
    def _check_identifier(name: str) -> str:
        """Validate a table/column name before SQL interpolation.

        Identifiers cannot be bound as SQL parameters, so they are
        checked here to block SQL injection through table/column names.
        """
        if not name.isidentifier():
            raise ValueError(f"非法的标识符: {name}")
        return name

    def createTable(self, table_name: str):
        """Create the results table if it does not exist.

        :param table_name: name of the table (the search keyword)
        :return: None
        """
        try:
            name = self._check_identifier(table_name)
            sql = (
                f"CREATE TABLE IF NOT EXISTS {name}"
                "(id INTEGER PRIMARY KEY AUTOINCREMENT,"
                "company VARCHAR(255),"
                "salary VARCHAR(255), degree VARCHAR(255),"
                " city VARCHAR(255))"
            )
            self.cur.execute(sql)
            # Remembered so insertdata knows which table to target.
            self.table_name = table_name
            self.logger.info("创建数据表成功")
        except Exception as error:
            self.logger.warning("创建表出现错误")

    def insertdata(self, result_list: list):
        """Insert parsed rows into the current table.

        :param result_list: list of dicts with company/salary/degree/city
        :return: None
        """
        for result_dict in result_list:
            company = result_dict.get("company", " ")
            salary = result_dict.get("salary", " ")
            degree = result_dict.get("degree", " ")
            city = result_dict.get("city", " ")
            try:
                # Values are bound as parameters; only the validated
                # table name is interpolated.
                self.cur.execute(
                    f"INSERT INTO {self._check_identifier(self.table_name)}"
                    " values(?,?,?,?,?)",
                    (None, company, salary, degree, city))
            except Exception as error:
                self.logger.warning(f"插入数据报错,报错信息{error}")
        self.con.commit()

    def searchdata(self, query_column: str, table_name: str,
                   city_name: str = None) -> list:
        """Query one column filtered by city.

        :param query_column: column to select
        :param table_name: table to query
        :param city_name: city filter value
        :return: list of result tuples; empty list on error
        """
        try:
            # Identifiers are validated and the city value is bound as a
            # parameter — the original interpolated city_name directly
            # into the SQL string (injection-prone).
            self.cur.execute(
                f"select {self._check_identifier(query_column)}"
                f" from {self._check_identifier(table_name)}"
                " where city=?",
                (city_name,))
            return self.cur.fetchall()
        except Exception as error:
            self.logger.error(f"查询数据库出现错误,错误信息为{error}")
            return []
if __name__ == '__main__':
    # Manual smoke run: create a table and issue a query against it.
    store = DataStore()
    store.createTable("java")
    store.searchdata(query_column='salary', city_name="北京",
                     table_name="java")
272 # !/usr/bin/env python3
273 # _*_ coding:utf-8 _*_
274 """
275 @File : worldcloudgene.py
276 @Project : coursedesign
277 @Time : 2021/12/23 23:35
278 @Author : TheNorth
279 @Function :
280 """
281 import wordcloud
class WorldCloudGene(object):
    """Word-cloud generation for the collected skill keywords."""

    def generateWorldCloud(self, world_cloud_content: str, png_name: str):
        """Render *world_cloud_content* as a word cloud and save it.

        :param world_cloud_content: whitespace-separated keyword text
        :param png_name: output file name without the ".png" suffix
        :return: None
        """
        cloud = wordcloud.WordCloud(width=1000, height=700,
                                    background_color='white',
                                    font_path='msyh.ttc')
        # Feed the text in, then write "<png_name>.png" into the
        # current working directory.
        cloud.generate(world_cloud_content)
        cloud.to_file(f'{png_name}.png')
293 # !/usr/bin/env python3
294 # _*_ coding:utf-8 _*_
295 """
296 @File : config.py
297 @Project : coursedesign
298 @Time : 2021/12/30 19:51
299 @Author : TheNorth
300 @Function :
301 """
class Config(object):
    """Static crawler configuration."""

    # Search endpoint of lagou.com.
    BASE_URL = "https://www.lagou.com/wn/jobs"
    # Headers sent with every request; the UA mimics desktop Firefox.
    REQUEST_HEADERS = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) '
                       'Gecko/20100101 Firefox/34.0'),
        'Accept': '*/*',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        "Content-Type": "application/json",
    }
315 # !/usr/bin/env python3
316 # _*_ coding:utf-8 _*_
317 """
318 @File : log.py
319 @Project : coursedesign
320 @Time : 2021/12/30 20:17
321 @Author : TheNorth
322 @Function :
323 """
324 import sys
325 import pathlib
326 from loguru import logger
# --- Path setup ---
relative_directory = pathlib.Path(__file__).parent.parent  # project root
result_save_dir = relative_directory.joinpath('results')   # results directory
log_path = result_save_dir.joinpath('crawlsite.log')       # crawl log file

# --- Logging configuration ---
# Terminal log line format.
stdout_fmt = ('<cyan>{time:HH:mm:ss,SSS}</cyan> '
              '[<level>{level: <5}</level>] '
              '<blue>{module}</blue>:<cyan>{line}</cyan> - '
              '<level>{message}</level>')
# Log-file line format.
logfile_fmt = ('<light-green>{time:YYYY-MM-DD HH:mm:ss,SSS}</light-green> '
               '[<level>{level: <5}</level>] '
               '<cyan>{process.name}({process.id})</cyan>:'
               '<cyan>{thread.name: <18}({thread.id: <5})</cyan> | '
               '<blue>{module}</blue>.<blue>{function}</blue>:'
               '<blue>{line}</blue> - <level>{message}</level>')

logger.remove()
logger.level(name='TRACE', color='<cyan><bold>', icon='')
logger.level(name='DEBUG', color='<blue><bold>', icon='🐞 ')
logger.level(name='INFOR', no=20, color='<green><bold>', icon='')
logger.level(name='QUITE', no=25, color='<green><bold>', icon='🤫 ')
logger.level(name='ALERT', no=30, color='<yellow><bold>', icon='')
logger.level(name='ERROR', color='<red><bold>', icon='')
logger.level(name='FATAL', no=50, color='<RED><bold>', icon='')
# To run silently in the terminal, change the level below to QUITE;
# the terminal sink defaults to INFOR.
logger.add(sys.stderr, level='INFOR', format=stdout_fmt, enqueue=True)
# The file sink records everything from DEBUG up.
logger.add(log_path, level='DEBUG', format=logfile_fmt, enqueue=True,
           encoding='utf-8')