^ 以某个开头
$ 以某个结尾
* 某个任意多次,大于等于0
? 跟在量词(* + { })后面时取消贪婪匹配,可以理解为改为从左到右匹配到某个为止;单独跟在某个后面时表示某个出现0次或1次
+ 某个至少为一次,大于等于1
{ }例,{2,5},某个出现2到5次.....{2},{2,}等
| 或者(a|b),选a或者b
[ ]有三种意思,1.[13567]中括号的任选一个-------2.[0-9],[a-z]-------3.[.]就代表.号,不代表任意字符了
[^]例,[^1]非1
[a-z]同上
\s空白字符(空格、制表符、换行等)
\S非空白字符
\w代表[A-Za-z0-9_]
\W代表非[A-Za-z0-9_]
[\u4E00-\u9FA5]代表汉字
( )略
\d数字
import re
# Sample sentences ("you were born on <date>") covering the date formats the
# pattern below is meant to accept.
line1 = "你出生在2016-09-01"
line2 = "你出生在2016-9-1"
line3 = "你出生在2016/09/01"
line4 = "你出生在2016年9月1号"
line5 = "你出生在2016-09"
# Raw string: keeps \d as a regex token instead of relying on Python leaving an
# unknown string escape untouched (which newer interpreters warn about).
# Shape: prefix, 4-digit year, separator (- / 年), 1-2 digit month, then either
# end of string or a separator (月 / -) plus a 1-2 digit day that is followed
# by end of string or 号.
regex_str = r"(你出生在\d{4}(-|/|年)\d{1,2}($|(月|/|-)\d{1,2}($|号)))"
match_obj = re.match(regex_str, line5)
if match_obj:
    print(match_obj.group(1))
# All five sample lines match this pattern.
简单的爬虫
# -*- coding: utf-8 -*-
# @Author: Lai
import re
import os
import requests
# Root directory for the crawler's output: each title gets its own
# sub-directory below it, holding one .html file per chapter.
BASE_PATH = "E:/EW/"
def get_block_link(url, timeout=10):
    """Download a listing page and return the links extracted from it.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection blocks the crawler forever.

    Returns:
        A list with the ``href`` value of every matching anchor, in the order
        they appear in the page; empty when nothing matches.
    """
    response = requests.get(url, timeout=timeout)
    # Matches anchors with class "l mr10" that are immediately followed by an
    # <img onerror ...> tag and captures their href.
    pattern = '<a target="_blank" href="(.*?)" class="l mr10"><img onerror'
    return re.findall(pattern, response.text, re.S)
# Resolve every entry of a listing page to the link of its "reader" page.
def get_child_link(url, timeout=10):
    """Return the "reader" link found on each page linked from a listing page.

    Args:
        url: Address of the listing page; passed to ``get_block_link``.
        timeout: Seconds to wait for each page requested directly by this
            function before ``requests`` gives up.

    Returns:
        A list of the first ``class="reader"`` href found on each linked page.
        Pages without such an anchor are skipped instead of raising
        ``IndexError``.
    """
    pattern = '<a href="(.*?)" class="reader" title=".*?">.*?</a>'
    reader_links = []
    for detail_url in get_block_link(url):
        response = requests.get(detail_url, timeout=timeout)
        first_match = re.search(pattern, response.text)
        # A page without a reader anchor must not abort the whole crawl.
        if first_match:
            reader_links.append(first_match.group(1))
    return reader_links
def _first_group(pattern, text, default="空"):
    """Return group 1 of the first DOTALL match of *pattern*, or *default*."""
    found = re.search(pattern, text, re.S)
    return found.group(1) if found else default


def get_charterName_and_content(url, timeout=10):
    """Download one chapter page and extract its name and body.

    Args:
        url: Address of the chapter page.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A ``(chapter_name, content)`` tuple of strings. Either element is the
        placeholder ``"空"`` when its pattern does not match the page.
    """
    response = requests.get(url, timeout=timeout)
    # The page is decoded as GBK rather than whatever requests would guess.
    response.encoding = "GBK"
    page = response.text

    # Raw strings keep \( and \s as regex tokens rather than string escapes.
    content_pattern = r'id="content"><script type="text/javascript">style5\(\);</script>(.*?)<script type="text/javascript">'
    name_pattern = r' class="article_title">.*?</a> >.*?\s(.*?)</div>'

    # Each pattern is evaluated once; the original ran every search twice.
    charter_name = _first_group(name_pattern, page)
    content = _first_group(content_pattern, page)
    return (charter_name, content)
# Collect the link of every chapter and save each chapter to disk.
def get_child_in_info(url, timeout=10):
    """Crawl a listing page and write every chapter it leads to as a file.

    For each reader page returned by ``get_child_link`` the title and the
    chapter links are extracted; every chapter is then fetched through
    ``get_charterName_and_content`` and written to
    ``BASE_PATH/<title>/<chapter name>.html``.

    Args:
        url: Address of the listing page to start from.
        timeout: Seconds to wait for each page requested directly by this
            function before ``requests`` gives up.

    Returns:
        None. The result is the files written below ``BASE_PATH``.
    """
    title_pattern = '<div class="chapName"><span class="r">.*?</span><strong>(.*?)</strong><div class="clear">'
    chapter_link_pattern = '<li><a href="(.*?)" title=".*?">.*?</a></li>'

    for reader_url in get_child_link(url):
        response = requests.get(reader_url, timeout=timeout)
        response.encoding = "GBK"
        page = response.text

        title_match = re.search(title_pattern, page, re.S)
        title = title_match.group(1) if title_match else "空"
        path = os.path.join(BASE_PATH, title)

        chapter_urls = re.findall(chapter_link_pattern, page)
        if not chapter_urls:
            # Nothing to save: do not leave an empty directory behind.
            continue

        # Created once per title; makedirs also creates BASE_PATH itself when
        # it does not exist yet, where os.mkdir would raise.
        os.makedirs(path, exist_ok=True)

        for chapter_url in chapter_urls:
            chart_name, content = get_charterName_and_content(chapter_url)
            # NOTE(review): names are used verbatim as file names; characters
            # the filesystem rejects would make open() fail - confirm whether
            # sanitising is needed.
            position = os.path.join(path, chart_name + ".html")
            # Explicit encoding so the Chinese text does not depend on the
            # platform's default codec.
            with open(position, "w", encoding="utf-8") as f:
                f.write(content)
if __name__ == "__main__":
    # Script entry point: crawl the first listing page and save its chapters.
    listing_url = "http://www.quanshuwang.com/list/1_1.html"
    get_child_in_info(listing_url)