import requests
from pyquery import PyQuery as pq
import uuid
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    'cookie': 'UM_distinctid=170a5a00fa25bf-075185606c88b7-396d7407-100200-170a5a00fa3507; CNZZDATA1274895726=1196969733-1583323670-%7C1583925652; Hm_lvt_45e50d2aec057f43a3112beaf7f00179=1583326696,1583756661,1583926583; Hm_lpvt_45e50d2aec057f43a3112beaf7f00179=1583926583'
}

def saveImage(imgUrl, name):
    # Download a single image and save it as <name>.jpg under the 学习文件 directory
    imgResponse = requests.get(imgUrl)
    fileName = "学习文件/%s.jpg" % name
    if imgResponse.status_code == 200:
        with open(fileName, 'wb') as f:
            f.write(imgResponse.content)
def getPic(urlArray):
    # Visit each page URL, extract the image source, and save the image
    for url in urlArray:
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            doc = pq(res.text)
            imgSrc = doc('.info-pic-list > a > img').attr('src')
            print(imgSrc)
            saveImage(imgSrc, uuid.uuid1().hex)
def createUrl(indexUrl, allPage):
    # Build the paginated URLs of one gallery: xxx.html -> xxx_1.html, xxx_2.html, ...
    baseUrl = indexUrl.split('.html')[0]
    urlArray = []
    for i in range(1, allPage):
        tempUrl = baseUrl + "_" + str(i) + ".html"
        urlArray.append(tempUrl)
    return urlArray
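# Usage sketch (the gallery URL below is illustrative, not taken from the site):
#   createUrl("https://www.nanrentu.cc/sgtp/12345.html", 4)
# returns
#   ["https://www.nanrentu.cc/sgtp/12345_1.html",
#    "https://www.nanrentu.cc/sgtp/12345_2.html",
#    "https://www.nanrentu.cc/sgtp/12345_3.html"]
# Note that range(1, allPage) stops at allPage - 1.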
def main():
    baseUrl = "https://www.nanrentu.cc/sgtp/"
    response = requests.get(baseUrl, headers=headers)
    if response.status_code == 200:
        with open("index.html", 'w', encoding="utf-8") as f:
            f.write(response.text)
        doc = pq(response.text)
        # Get the title links of all galleries
        titleLinks = doc('.h-piclist > li > a').items()
        # Iterate over these links
        for link in titleLinks:
            getBoys(link)
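# getBoys is not defined in this excerpt. A minimal sketch, assuming each title
# link carries the gallery URL in its href and delegating to createUrl and getPic;
# the fixed page count is a placeholder assumption, the real value would be
# parsed from the gallery page:
def getBoys(link):
    indexUrl = link.attr('href')                 # gallery index page, e.g. .../sgtp/xxxx.html
    urlArray = createUrl(indexUrl, allPage=10)   # assumed page count (placeholder)
    getPic(urlArray)

# Entry point (assumed; not shown in this excerpt)
if __name__ == '__main__':
    main()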