# Example: simulating a real website login (fetch the CSRF token and captcha, then post the login form).
import time
import requests
from bs4 import BeautifulSoup
def captcha(data):
    """Save captcha image bytes to disk and ask the user to read them.

    The bytes in ``data`` are written to ``captcha.png`` in the current
    working directory so the user can open the image, then the user is
    prompted on standard input for the text it shows.

    Args:
        data: Raw bytes of the captcha image.

    Returns:
        The captcha string typed by the user.
    """
    with open("captcha.png", "wb") as image_file:
        image_file.write(data)

    # Hand back whatever the user types after viewing the saved image.
    answer = input("请输入验证码:")
    return answer
def login(email="123456789@qq.com", password="ABCDEFG"):
    """Log in to zhihu.com by email and save the account settings page.

    The flow is: fetch the home page and read the ``_xsrf`` token from its
    form, download the captcha image and have the user type it in (via
    ``captcha``), POST the login form, then request the account settings
    page with the same session and write it to ``myhtml.html``.

    Args:
        email: Account email submitted in the login form.
        password: Account password submitted in the login form.

    Raises:
        RuntimeError: If the home page has no ``_xsrf`` input field.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    }

    # One session for every request so cookies set by the server carry over;
    # the context manager closes the session when the flow finishes or fails.
    with requests.session() as ssion:
        # Fetch the login page; this URL must not be changed.
        html = ssion.get("https://www.zhihu.com/", headers=headers).content

        # Parse the page with the lxml parser and pull out the _xsrf token.
        soup = BeautifulSoup(html, "lxml")
        xsrf_input = soup.find("input", attrs={"name": "_xsrf"})
        if xsrf_input is None:
            raise RuntimeError(
                "no _xsrf input field found on https://www.zhihu.com/"
            )
        _xsrf = xsrf_input.get("value")

        # Build the captcha URL from the current time in milliseconds.
        captchaURL = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000)
        # Request the captcha image; the response body is the image data.
        captcha_data = ssion.get(captchaURL, headers=headers).content

        # Form fields for the login POST.
        formdata = {
            "email": email,
            "password": password,
            "captcha": captcha(captcha_data),
            "_xsrf": _xsrf,
        }

        # Send the login request so the session receives the login cookies;
        # this URL must not be changed.
        ssion.post("https://www.zhihu.com/login/email", data=formdata, headers=headers)

        # Request a page that requires login and save it locally.
        html = ssion.get("https://www.zhihu.com/settings/account#signin", headers=headers).content

    # Write with an explicit UTF-8 encoding: the platform default may be
    # unable to encode the decoded page text.
    with open("myhtml.html", "w", encoding="utf-8") as f:
        f.write(html.decode("utf-8"))
# Run the login flow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    login()