#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib
import urllib2
import lxml
import re
import MySQLdb
import time
from bs4 import BeautifulSoup
import httplib
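# Downgrade httplib to HTTP/1.0 so the server closes the connection after each
# response; this sidesteps IncompleteRead errors from chunked/keep-alive replies.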
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'
hdr = { 'User-Agent' : user_agent }
db = MySQLdb.connect(host="localhost", port=3306, user="root", passwd="123456", db="xiaoshuo", charset="utf8")
str_sql = '''INSERT INTO `xiaoshuo`.`book1` (`bookName`, `author`, `url`, `classifyName`, `brief`, `updateTime`,
`status`) VALUES'''
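# Each book scraped below appends one quoted value tuple to str_sql; the
# accumulated multi-row INSERT is executed once at the end of get_book().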
def getBookInfoBaseOnUrl(url):
    # Fetch one book page and pull its metadata out of the OpenGraph
    # <meta property="og:novel:..."> tags in <head>.
    global str_sql
    request = urllib2.Request(url, headers=hdr)
    response = urllib2.urlopen(request)
    html_data = response.read().decode('gbk')  # the site serves GBK-encoded pages
    soup = BeautifulSoup(html_data, 'lxml')
    mylist = soup.select('head')
    for item in mylist:
        bookName = item.find(property="og:novel:book_name").get("content")
        print "Book name:", bookName
        author = item.find(property="og:novel:author").get("content")
        print "Author:", author
        url = item.find(property="og:novel:read_url").get("content")
        print "URL:", url
        classifyName = item.find(property="og:novel:category").get("content")
        print "Category:", classifyName
        description = item.find(property="og:description").get("content")
        print "Brief:", description
        updateTime = item.find(property="og:novel:update_time").get("content")
        print "Update time:", updateTime
        status = item.find(property="og:novel:status").get("content")
        print "Status:", status
        str_sql += '("' + bookName + '", "' + author + '", "' + url + '", "' + classifyName + '", "' + description + '", "' + updateTime + '", "' + status + '"),'
        print "-----------------------------------------------------------------------------------------"
def get_book():
    global str_sql
    cursor = db.cursor()
    # The listing page was saved locally as biquge.html; parse it offline.
    soup = BeautifulSoup(open('biquge.html'), 'lxml')
    mylist = soup.find_all('div', class_='content')
    # First pass: books listed as <li> items.
    for block in mylist:
        xiaoshuo_list = block.find_all('li')
        for item in xiaoshuo_list:
            url = item.find('a').get('href')
            print "Book link:", url
            getBookInfoBaseOnUrl(url)
    # Second pass: books listed as <dl> items.
    for block in mylist:
        xiaoshuo_list = block.find_all('dl')
        for item in xiaoshuo_list:
            url = item.find('a').get('href')
            print "Book link:", url
            getBookInfoBaseOnUrl(url)
    # Drop the trailing comma and run the accumulated multi-row INSERT.
    str_sql = str_sql.encode("utf-8")
    str_sql = str_sql[0:len(str_sql) - 1]
    print "tmp_sql:", str_sql
    cursor.execute(str_sql)
    db.commit()
    cursor.close()
if __name__ == "__main__":
    print "<<<-----Start Get Book INFO And Save Db------>>"
    get_book()
    db.close()
    print str_sql
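A note on the SQL built above: splicing scraped strings directly into the statement breaks as soon as a field contains a double quote, and it leaves the script open to SQL injection. A safer variant, sketched below for the same table layout, collects the rows as tuples and lets the MySQLdb driver do the quoting (the helper name save_books and the rows list are mine, not part of the original script):

def save_books(db, rows):
    # rows: list of 7-tuples
    # (bookName, author, url, classifyName, brief, updateTime, status)
    cursor = db.cursor()
    cursor.executemany(
        """INSERT INTO `xiaoshuo`.`book1`
               (`bookName`, `author`, `url`, `classifyName`,
                `brief`, `updateTime`, `status`)
           VALUES (%s, %s, %s, %s, %s, %s, %s)""",
        rows)
    db.commit()
    cursor.close()

With this shape, getBookInfoBaseOnUrl would append a tuple to a list instead of concatenating into the global str_sql.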
Python Crawler in Practice (4)
Original author: 后打开撒打发了
Original article: https://blog.csdn.net/chenxun_2010/article/details/77896531
This article is reposted from the web to share knowledge; if it infringes on your rights, please contact the blogger to have it removed.