一、基本认识
1、发送一个get请求
import requests
if __name__ == “__main__”:
# 获取一个get请求
response = requests.get(‘http://httpbin.org/get‘)
2、关于获取请求到数据常见的返回值
import requests
if __name__ == “__main__”:
# 获取一个get请求
response = requests.get(‘http://httpbin.org/get‘)
# 对抓取的网站设置编码
response.encoding = ‘utf-8’
# 打印返回的数据
print(response.text)
print(response.json())
print(response.headers)
print(response.status_code)
print(response.url)
print(response.cookies)
print(response.json())
# 获取最原始的字符串,没有编码的(用户response.text出现乱码的时候,及下载二进制文件的时候)
print(response.content)
3、关于其他的请求方式
response = requests.post(‘http://httpbin.org/post‘)
response = requests.put(‘http://httpbin.org/put‘)
response = requests.delete(‘http://httpbin.org/delete‘)
response = requests.head(‘http://httpbin.org/get‘)
response = requests.options(‘http://httpbin.org/get‘)
1、直接在url地址后面拼接参数
import requests
if __name__ == “__main__”:
# 定义一个请求头(模拟浏览器)
headers = {‘User-Agent’: ‘Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36’}
# 设置参数
data = {‘name’: ‘june’, ‘password’: 123456}
# 获取一个get请求
response = requests.get(‘http://httpbin.org/get?name=june&password=123456‘, headers=headers)
# 对抓取的网站设置编码
response.encoding = ‘utf-8’
print(response.text)
2、使用params传递参数
import requests
if __name__ == “__main__”:
# 定义一个请求头(模拟浏览器)
headers = {‘User-Agent’: ‘Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36’}
# 设置参数
data = {‘name’: ‘june’, ‘password’: 123456}
# 获取一个get请求
response = requests.get(‘http://httpbin.org/get‘, headers=headers, params=data)
# 对抓取的网站设置编码
response.encoding = ‘utf-8’
print(response.text)
1、需要下载的伯乐在线的文章标题
2、书写逻辑代码
import re
import requests
if __name__ == “__main__”:
headers = {
‘User-Agent’: ‘Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36’,
}
url = ‘http://python.jobbole.com/category/guide/‘
response = requests.get(url=url, headers=headers)
pattern = re.compile(
‘<div.*?post-thumb.*?title=”(.*?)”.*?</a>’, re.S
)
print(response.status_code)
result_list = re.findall(pattern, response.text)
f = open(‘jobbole1.txt’, ‘a+’, encoding=’utf8′)
for item in result_list:
f.write(item.strip() + ‘\n’)
f.close()
3、解说正则表达式
.*?表示非贪婪的匹配任何字符
re.S 使.匹配包括换行在内的全部字符
1、导包
import re
import os
import shutil
import requests
2、定义一个下载图片的类
class DownPic(object):
def __init__(self):
self.url = ‘http://python.jobbole.com/category/guide/‘
self.headers = {
‘User-Agent’: ‘Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36’,
}
self.create_dir()
def create_dir(cls):
# 如果文件夹存在就删除
if os.path.exists(‘demo’):
shutil.rmtree(‘demo’)
os.makedirs(‘demo’)
def get_html(self):
response = requests.get(url=self.url, headers=self.headers)
return response.text
def pattern(self):
pattern = re.compile(
‘<div.*?post-thumb.*?src=”(.*?)”.*?</a>’, re.S
)
result_list = re.findall(pattern, self.get_html())
return result_list
def download(self):
for item in self.pattern():
# 获取到的图片地址再次请求
if item.rsplit(‘.’)[-1] in [‘png’, ‘jpg’]:
resp = requests.get(item.strip())
try:
with open(os.path.join(‘demo’, item.strip().rsplit(“/”)[-1]), ‘wb’) as f:
f.write(resp.content)
except Exception as e:
print(e)
else:
continue
3、调用
if __name__ == “__main__”:
p = DownPic()
p.download()
1、格式
response = requests.post(‘http://httpbin.org/post‘, headers=headers, data=data)
2、发送数据到服务器端
import requests
if __name__ == “__main__”:
# 定义一个请求头(模拟浏览器)
headers = {
‘User-Agent’: ‘Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36’}
# 设置参数
data = {’email’: ‘22@qq.com‘, ‘password’: 123456}
# 获取一个get请求
response = requests.post(‘https://httpbin.org/post‘, headers=headers, data=data)
# 对抓取的网站设置编码
response.encoding = ‘utf-8’
print(response.text)
import requests
if __name__ == “__main__”:
url = ‘https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false‘
headers = {
‘User-Agent’: ‘Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36’,
‘Referer’: ‘https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=‘
}
data = {
‘first’: ‘true’,
‘pn’: ‘1’,
‘kd’: ‘python’,
}
response = requests.post(url=url, headers=headers, data=data)
print(response.json())