这是上学吧的弟弟——赏学吧的纯问题爬取,之前有人发过答案接口,不过已经失效了。我爬得差不多了。
import requests
import pymysql
from bs4 import BeautifulSoup
import time
import json
import re
import threading
def TODB(id, question, answer):
    """Insert one scraped row into the ``disanfang`` table.

    A new connection to the local ``shangxueba`` MySQL database is opened
    for every call and closed again before returning.

    Args:
        id: Value stored in the ``id`` column (the caller passes the
            numeric page id).
        question: Value stored in the ``question`` column.
        answer: Value stored in the ``answer`` column.

    Raises:
        pymysql.MySQLError: If connecting or executing the INSERT fails.
            The connection is still closed in that case.
    """
    conn = pymysql.connect(host='localhost',  # server host
                           user='shangxueba',  # user name
                           passwd='shangxueba',  # password
                           port=3306,  # MySQL's default port
                           db='shangxueba',  # database name
                           charset='utf8'  # connection character set
                           )
    try:
        sql = "INSERT INTO disanfang (id, question,answer) VALUES (%s, %s,%s)"
        val = (id, question, answer)
        with conn.cursor() as cur:
            # Parameters are passed separately so the driver escapes them.
            cur.execute(sql, val)
        # PyMySQL does not autocommit by default; without this the INSERT is
        # rolled back when the connection closes on transactional engines.
        conn.commit()
    finally:
        # Release the connection even when the INSERT raises.
        conn.close()


# Request headers sent with every page fetch; only a desktop-browser
# User-Agent is set.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.25 Safari/537.36 "
        "Core/1.70.3760.400 QQBrowser/10.5.4083.400"
    ),
}

# Running tally of questions stored so far, used in the progress output.
count = 0

# Text of the marker paragraph that precedes the real question on some pages.
quedes = "【题目描述】"

# Exact response body the crawl loop treats as "question missing"; it is
# compared against the full page text.
wenti404 = (
    "<script>alert('该问题不存在或未审核!');</script>"
    "<script>location.href='https://www.shangxueba.cn/about/err_404_403.aspx';</script>"
)
# Original author's note: 2683074 questions in total, start from 376.
# NOTE(review): the range below actually starts at 163940, so the "376" part
# of that note looks stale.

# Seconds to wait for the server before giving up on one page. Without a
# timeout a single stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 10

for nownum in range(163940, 2683074, 2):
    # range(first id, end id (exclusive), step): every second id is fetched.
    # Original author's note: this should have been multithreaded; instead,
    # run several copies of the script side by side.
    # Original author's note: from id 477 onward the pages follow a regular
    # pattern.
    url = "https://www.shangxueba.cn/"+str(nownum)+".html"
    try:
        html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # One bad connection should not abort a crawl of this size.
        print("id为"+str(nownum)+"请求失败,下一个: "+str(err))
        continue

    # The site answers a missing question with a fixed redirect script.
    if html.text == wenti404:
        print("界面404,下一个")
        continue

    htmlsoup = BeautifulSoup(html.text, 'html.parser')
    qlist = htmlsoup.find_all("p")
    if not qlist:
        # No <p> at all: indexing qlist[0] would raise IndexError.
        print("id为"+str(nownum)+"的页面未找到题目,下一个")
        continue

    # When the first paragraph is only the description marker, the question
    # text is in the paragraph after it.
    if qlist[0].string == quedes:
        if len(qlist) < 2:
            print("id为"+str(nownum)+"的页面未找到题目,下一个")
            continue
        qtag = qlist[1]
    else:
        qtag = qlist[0]

    # .string is None when the paragraph holds nested markup; fall back to
    # the concatenated text so None is not stored as the question.
    question = qtag.string
    if question is None:
        question = qtag.get_text()

    answer = ''
    TODB(nownum, question, answer)
    # Pause between stored pages, in seconds.
    time.sleep(0.5)
    # count is printed before it is incremented, so the ordinal is zero-based.
    print("当前id为"+str(nownum)+"是第"+str(count)+"个")
    count = count+1

作者:晚空,如若转载,请注明出处:《上学吧爬取问题——纯问题爬取-py》