| 
终身高级VIP会员   
 
资源币44 积分233贡献0 在线时间22 小时注册时间2020-4-10最后登录2025-4-26 
 
 | 
 
| 本帖最后由 请我喝茶 于 2020-5-5 14:15 编辑 
 # -*- coding: utf-8 -*-
 
 # Author: 桑葚ICE
 # Email: 152516cc@gmail.com
 # Blog: iicey.github.io
 # JueJin: juejin.im/user/5c64dce8e51d45013c40742c
 import re
 import time
 
 import requests
 from scrapy import Selector
 
 
 class Spider:
     """Collect thread ids from www.zygx8.com boards and harvest share links.

     The share link and its extraction code live inside a thread's
     ``<div class="showhide">`` element, which the site only renders after the
     account has replied to the thread.  Results are accumulated in
     ``bd_url_info`` as ``{share_url: extraction_code}``.

     Authentication for page requests is done through the raw ``cookie``
     header passed to the constructor; ``enter()`` stores the cookies of a
     username/password login in ``self.cookies`` but the other methods do not
     send them.
     """

     FORUM_URL = 'https://www.zygx8.com/forum.php'
     MEMBER_URL = 'https://www.zygx8.com/member.php'

     # Text the site shows in place of the hidden block until we have replied.
     HIDDEN_MARKER = "如果您要查看本帖隐藏内容请"

     # requests never times out by default; avoid hanging on a stalled socket.
     REQUEST_TIMEOUT = 30

     # Printable ASCII only, so a label glued to the link is not swallowed.
     _URL_RE = re.compile(r"https://[\x21-\x7e]+")
     # Anchor on the "提取码" label: a bare \w{4} is Unicode-aware in Python 3
     # and would happily match four Chinese characters or "http".
     _PWD_RE = re.compile(r"提取码[^A-Za-z0-9]*([A-Za-z0-9]{4})")

     def __init__(self, un="", pwd="", cookie=""):
         """Store credentials and prepare empty result containers.

         :param un: account name used by ``enter()``.
         :param pwd: account password used by ``enter()``.
         :param cookie: raw ``Cookie`` header value sent with every request.
         """
         self.un = un
         self.pwd = pwd
         self.cookies = {}
         self.headers = {"cookie": cookie}
         self.bd_url_info = {}

     def enter(self):
         """Log in with ``un``/``pwd`` and keep the response cookies.

         The cookies are stored as a plain dict in ``self.cookies``.
         """
         params = {
             "mod": "logging",
             "action": "login",
             "loginsubmit": "yes",
             "infloat": "yes",
             "lssubmit": "yes",
             "inajax": "1"
         }
         data = {
             "fastloginfield": "username",
             "username": self.un,
             "password": self.pwd,
             "quickforward": "yes",
             "handlekey": "ls",
         }
         response = requests.post(
             self.MEMBER_URL,
             headers=self.headers,
             params=params,
             data=data,
             timeout=self.REQUEST_TIMEOUT,
         )
         self.cookies = requests.utils.dict_from_cookiejar(response.cookies)

     def fid_to_tid(self, fid, page=1, tid_s=None):
         """Return the set of thread ids listed on board ``fid``.

         Pages are fetched one after another starting at ``page``; paging
         stops at the first page that contributes no thread id that was not
         already collected.

         :param fid: board (forum) id.
         :param page: first page number to fetch.
         :param tid_s: optional set to extend in place; a new set is created
             when omitted.
         :return: the set of thread ids as strings.
         """
         if tid_s is None:
             tid_s = set()

         # Iterate instead of recursing once per page, so a board with many
         # pages cannot exhaust the interpreter's recursion limit.
         while True:
             params = (
                 ('mod', 'forumdisplay'),
                 ('fid', fid),
                 ('page', page),
                 ('t', '5104641'),
             )
             response = requests.get(
                 self.FORUM_URL,
                 headers=self.headers,
                 params=params,
                 timeout=self.REQUEST_TIMEOUT,
             )
             tid_l = Selector(text=response.text).xpath(
                 '//*[@id="threadlisttableid"]/tbody[contains(@id,"normalthread")]/@id'
             ).extract()

             old_count = len(tid_s)
             tid_s.update(i.replace("normalthread_", "") for i in tid_l)
             if len(tid_s) == old_count:
                 return tid_s
             page += 1

     @staticmethod
     def _parse_share_info(url_info):
         """Pick the share URL and extraction code out of text fragments.

         :param url_info: iterable of strings (text nodes of the hidden block).
         :return: ``(url, code)`` when both are present, otherwise ``None``.
         """
         bd_url = None
         bd_pwd = None
         for text in url_info:
             if bd_url is None:
                 found = Spider._URL_RE.search(text)
                 if found:
                     bd_url = found.group(0)
             if bd_pwd is None:
                 found = Spider._PWD_RE.search(text)
                 if found:
                     bd_pwd = found.group(1)
         if bd_url is None or bd_pwd is None:
             return None
         return bd_url, bd_pwd

     def get_content(self, fid, tid, max_retries=3, retry_delay=60):
         """Fetch thread ``tid`` and record its share link in ``bd_url_info``.

         While the page still shows the "reply to view" marker, a reply is
         posted, the method waits ``retry_delay`` seconds and fetches again,
         at most ``max_retries`` times.  Failures are reported with ``print``
         and leave ``bd_url_info`` untouched.

         :param fid: board id the thread belongs to (needed for replying).
         :param tid: thread id.
         :param max_retries: maximum number of reply-and-refetch rounds.
         :param retry_delay: seconds to wait after replying before refetching.
         :return: None
         """
         url = f"{self.FORUM_URL}?mod=viewthread&tid={tid}&extra=page%3D1"
         retries_left = max(max_retries, 0)

         while True:
             response = requests.get(
                 url,
                 headers=self.headers,
                 timeout=self.REQUEST_TIMEOUT,
             )
             if self.HIDDEN_MARKER not in response.text:
                 break
             if retries_left == 0:
                 # Give up instead of replying forever when the reply is not
                 # accepted (e.g. stale form hash or expired cookie).
                 print("hidden content still locked, giving up:", response.url)
                 return
             retries_left -= 1
             self.post_content(fid, tid)
             time.sleep(retry_delay)

         url_info = Selector(text=response.text).xpath(
             '//div[@class="showhide"]//text()'
         ).extract()
         share = self._parse_share_info(url_info)
         if share is None:
             print("share link or extraction code not found:", response.url)
             return

         bd_url, bd_pwd = share
         print(bd_url, bd_pwd)
         self.bd_url_info[bd_url] = bd_pwd

     def post_content(self, fid, tid, formhash='78484d61'):
         """Post the reply ``666`` to thread ``tid`` on board ``fid``.

         :param fid: board id.
         :param tid: thread id.
         :param formhash: value of the form's ``formhash`` field.
             NOTE(review): the default is the value hard-coded in the original
             script; it is presumably tied to one login session -- confirm and
             pass a current value.
         :return: None
         """
         params = (
             ('mod', 'post'),
             ('infloat', 'yes'),
             ('action', 'reply'),
             ('fid', fid),
             ('extra', ''),
             ('tid', tid),
             ('replysubmit', 'yes'),
             ('inajax', '1'),
         )

         data = {
             'formhash': formhash,
             'handlekey': 'reply',
             'noticeauthor': '',
             'noticetrimstr': '',
             'noticeauthormsg': '',
             'usesig': '0',
             'subject': '',
             'message': '666'
         }

         requests.post(
             self.FORUM_URL,
             headers=self.headers,
             params=params,
             data=data,
             timeout=self.REQUEST_TIMEOUT,
         )

     def main(self):
         """Placeholder; the script drives the spider from ``__main__``."""
         pass
 
 
 if __name__ == '__main__':
     # Replace the placeholder with the Cookie header copied from a logged-in
     # browser session (developer tools, F12).
     cookie = "浏览器F12打开取出cookie放到这里"
     spider = Spider(cookie=cookie)

     # Walk every thread of each listed board and harvest its share link.
     for fid in [158]:
         for tid in spider.fid_to_tid(fid):
             spider.get_content(fid, tid)

     # Show everything collected once all boards have been processed.
     print(spider.bd_url_info)
 
 | 
 |