这是上学吧的弟弟——赏学吧的纯问题爬取,之前有人发过答案接口,不过已经失效了。我爬得差不多了。
import json
import re
import threading
import time

import pymysql
import requests
from bs4 import BeautifulSoup
def TODB(id, question, answer):
    """Insert one scraped question into the ``disanfang`` table.

    A new MySQL connection is opened for every call and is always closed
    again before returning, even when the insert fails.

    Args:
        id: Id of the question page; stored in the ``id`` column.  (The
            name shadows the builtin but is kept so existing keyword
            callers keep working.)
        question: Question text stored in the ``question`` column.
        answer: Answer text stored in the ``answer`` column.

    Raises:
        pymysql.MySQLError: If connecting to the server or executing the
            INSERT fails.
    """
    conn = pymysql.connect(
        host='localhost',  # server host
        user='shangxueba',  # user name
        passwd='shangxueba',  # password
        port=3306,  # server port
        db='shangxueba',  # database name
        charset='utf8',  # connection character set
    )
    try:
        with conn.cursor() as cur:
            sql = "INSERT INTO disanfang (id, question,answer) VALUES (%s, %s,%s)"
            cur.execute(sql, (id, question, answer))
        # PyMySQL does not autocommit by default: without an explicit
        # commit the INSERT can be discarded when the connection closes.
        conn.commit()
    finally:
        conn.close()


# HTTP headers sent with every page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3760.400 QQBrowser/10.5.4083.400"
}
# Number of questions stored; updated once the crawl finishes.
count = 0
# Marker paragraph that precedes the real question text on some pages.
quedes = "【题目描述】"
# Body returned by the site for a missing or not-yet-reviewed question.
wenti404 = "<script>alert('该问题不存在或未审核!');</script><script>location.href='https://www.shangxueba.cn/about/err_404_403.aspx';</script>"


def is_missing_page(page_text):
    """Return True when *page_text* is the site's "question not found" stub.

    Surrounding whitespace is ignored so a trailing newline in the response
    does not make the stub look like a real question page.
    """
    return page_text.strip() == wenti404


def paragraph_text(tag):
    """Return the text of a BeautifulSoup tag as a plain ``str``.

    ``tag.string`` is ``None`` when the tag does not wrap exactly one string
    (for example a paragraph containing a ``<br/>``); fall back to
    ``get_text()`` so such paragraphs still yield their text.
    """
    text = tag.string
    if text is None:
        text = tag.get_text()
    return str(text)


def pick_question(paragraphs):
    """Choose the question text from the texts of a page's ``<p>`` tags.

    Args:
        paragraphs: Paragraph texts in document order.

    Returns:
        The paragraph following the ``quedes`` marker when the first
        paragraph is that marker, otherwise the first paragraph.  ``None``
        when the needed paragraph does not exist.
    """
    if not paragraphs:
        return None
    if paragraphs[0] != quedes:
        return paragraphs[0]
    return paragraphs[1] if len(paragraphs) > 1 else None


def crawl(start_id=163940, stop_id=2683074, step=2, delay=0.5, timeout=10):
    """Fetch question pages by id and store each question via ``TODB``.

    Author's notes carried over from the original comments: there are
    2683074 questions in total, ids start at 376 and follow a regular
    pattern from 477 on.  The script is not multithreaded; the author ran
    several copies side by side instead.

    Args:
        start_id: First page id to fetch.
        stop_id: Id at which to stop (exclusive).
        step: Distance between consecutive ids.
        delay: Seconds to sleep after each stored question.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of hanging the crawl.

    Returns:
        The number of questions stored.

    Raises:
        requests.RequestException: If a page cannot be fetched.  The crawl
            stops; the last printed id shows where it got to.
        pymysql.MySQLError: If storing a question fails.
    """
    stored = 0
    for nownum in range(start_id, stop_id, step):
        url = "https://www.shangxueba.cn/"+str(nownum)+".html"
        html = requests.get(url, headers=headers, timeout=timeout)
        # The site answers unknown ids with a fixed script stub.
        if is_missing_page(html.text):
            print("界面404,下一个")
            continue
        htmlsoup = BeautifulSoup(html.text, 'html.parser')
        # Only the first two paragraphs can hold the question text.
        leading = htmlsoup.find_all("p")[:2]
        question = pick_question([paragraph_text(tag) for tag in leading])
        if question is None:
            # No usable <p> on the page: skip it rather than crash.
            print("未找到题目内容,跳过id为"+str(nownum))
            continue
        TODB(nownum, question, '')
        # Pause between stored questions.
        time.sleep(delay)
        # Increment first so the first stored question is reported as 1.
        stored = stored+1
        print("当前id为"+str(nownum)+"是第"+str(stored)+"个")
    return stored


if __name__ == "__main__":
    count = crawl()
作者:晚空,如若转载,请注明出处:《上学吧爬取问题——纯问题爬取-py》