# _*_ coding:utf-8 _*_

import requests
from lxml import etree
import json

# Shared request headers. In the original each function built its own
# headers dict but never passed it to requests.get (bug — the custom
# User-Agent was silently ignored); it is now defined once and actually sent.
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
}


def htmlpage(start_url):
    """
    Fetch one search-result page, extract every job-detail link on it,
    and hand each absolute URL to pageparse().
    :param start_url: URL of a single search-result page.
    :return: None
    """
    # BUG FIX: headers are now passed to the request; a timeout is added
    # so a stalled server cannot hang the crawl forever.
    r = requests.get(start_url, headers=HEADERS, timeout=10)
    tree = etree.HTML(r.text)
    links = tree.xpath('//div[@class="names cutom_font"]/a/@href')
    for link in links:
        full_url = "https://www.shixiseng.com" + link
        pageparse(full_url)


def pageparse(full_url):
    """
    Parse one job-detail page: scrape the job title and the company name,
    then append the record to the local JSON-lines file.
    :param full_url: absolute URL of a job-detail page.
    :return: None
    """
    # BUG FIX: headers are now actually sent (they were unused before).
    r = requests.get(full_url, headers=HEADERS, timeout=10)
    tree = etree.HTML(r.text)
    # xpath() returns lists; they are stored as-is, matching the original
    # output format (e.g. {"jobname": ["..."], "comname": ["..."]}).
    jobname = tree.xpath('//div[@class="new_job_name"]/@title')
    comname = tree.xpath('//div[@class="job_com_name cutom_font"]/text()')
    content = {
        "jobname": jobname,
        "comname": comname,
    }
    writecontent(content)


def writecontent(content):
    """
    Append one scraped record to shixi.json as a single JSON line.
    :param content: dict with "jobname" and "comname" list values.
    :return: None
    """
    # BUG FIX: explicit UTF-8 encoding — with ensure_ascii=False the output
    # contains Chinese text, which breaks under a non-UTF-8 platform
    # default encoding (e.g. GBK on Windows).
    with open("shixi.json", "a", encoding="utf-8") as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


def main(base_url, begain, end):
    """
    Scheduler: crawl every result page in [begain, end).
    NOTE(review): the misspelled parameter name 'begain' is kept on purpose
    to preserve the function's public signature for any keyword callers.
    :param base_url: search URL already carrying the keyword query.
    :param begain: first page number (inclusive).
    :param end: last page number (exclusive, per range() semantics).
    :return: None
    """
    for page in range(begain, end):
        start_url = base_url + "&p=" + str(page)
        htmlpage(start_url)


if __name__ == "__main__":
    key = input("job:")              # job keyword entered by the user
    begain = int(input("start:"))    # first result page to crawl
    end = int(input("end:"))         # last result page (exclusive)
    url = "https://www.shixiseng.com/interns?k="  # search endpoint
    base_url = url + key             # search URL with the keyword appended
    main(base_url, begain, end)
结果
{ "jobname": ["爬虫实习"], "comname": ["宜信"]}{ "jobname": ["数据挖掘研究实习生(爬虫方向)"], "comname": ["网易游戏"]}{ "jobname": ["金融 Java-爬虫方向实习生(广州)"], "comname": ["唯品会"]}{ "jobname": ["爬虫工程师"], "comname": ["比地科技"]}{ "jobname": ["爬虫工程师"], "comname": ["踪履"]}{ "jobname": ["Java爬虫/数据采集工程师实习"], "comname": ["搜狐"]}{ "jobname": ["爬虫实习生"], "comname": ["地平线机器人"]}{ "jobname": ["Java开发实习生-爬虫开发"], "comname": ["京东金融"]}{ "jobname": ["爬虫工程师"], "comname": ["指食针"]}{ "jobname": ["爬虫实习生"], "comname": ["同花顺"]}{ "jobname": ["爬虫工程师"], "comname": ["TransferEasy"]}{ "jobname": ["数据采集(爬虫)工程师"], "comname": ["乐职网"]}{ "jobname": ["爬虫工程师"], "comname": ["探迹"]}{ "jobname": ["爬虫开发实习生"], "comname": ["妙计旅行"]}{ "jobname": ["网络爬虫实习生"], "comname": ["海天瑞声"]}{ "jobname": ["爬虫实习生"], "comname": ["阿博茨"]}{ "jobname": ["爬虫工程师实习生"], "comname": ["阿博茨"]}{ "jobname": ["助理爬虫工程师"], "comname": ["有数金服"]}{ "jobname": ["数据采集/爬虫工程师/软件工程师"], "comname": ["上海中估联"]}{ "jobname": ["网页爬虫"], "comname": ["赛迪技术"]}{ "jobname": ["爬虫实习生"], "comname": ["阿博茨"]}{ "jobname": ["JavaEE爬虫数据实习生"], "comname": ["行圆汽车"]}{ "jobname": ["Python爬虫数据实习生"], "comname": ["行圆汽车"]}{ "jobname": ["Python爬虫实习生"], "comname": ["商智通"]}{ "jobname": ["搜狐爬虫开发实习生(python)"], "comname": ["搜狐媒体"]}{ "jobname": ["爬虫开发实习生"], "comname": ["北京阿博茨"]}{ "jobname": ["爬虫开发实习生"], "comname": ["勤智数码"]}{ "jobname": ["爬虫系统工程师(实习)"], "comname": ["爱奇艺"]}