Scrapy Learning Path 1.1 (Regular Expressions)

^  matches at the start of the string

$  matches at the end of the string

*  the preceding element repeated any number of times (zero or more)

?  placed after a quantifier, makes it non-greedy: matching runs from left to right and stops as early as possible (see the short demo after this list)

+  the preceding element at least once (one or more)

{ }  a repetition count, e.g. {2,5} means the preceding element occurs 2 to 5 times; also {2}, {2,} and so on

|  alternation: (a|b) matches either a or b

[ ]  a character class, with three common uses: 1. [13567] matches any one of the listed characters; 2. ranges such as [0-9] or [a-z]; 3. inside the brackets . is a literal dot, not "any character"

[^ ]  negation, e.g. [^1] matches any character other than 1

[a-z]  a range, as above

\s  whitespace

\S  non-whitespace

\w  equivalent to [A-Za-z0-9_]

\W  any character not in [A-Za-z0-9_]

[\u4E00-\u9FA5]  the range of Chinese (CJK) characters

( )  grouping; the text matched inside the parentheses is captured as a group

\d  a digit
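
A minimal sketch of a few of the items above (the strings and patterns here are made up purely for illustration): greedy vs. non-greedy matching, character classes, and the CJK range.

import re

text = "<b>one</b><b>two</b>"
print(re.findall("<b>(.*)</b>", text))    # greedy: ['one</b><b>two']
print(re.findall("<b>(.*?)</b>", text))   # non-greedy: ['one', 'two']

print(re.findall(r"[0-9]+", "a1b22c333"))             # ['1', '22', '333']
print(re.findall(r"\w+", "hello_world 123!"))         # ['hello_world', '123']
print(re.findall(r"[\u4E00-\u9FA5]+", "abc你好123"))   # ['你好']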

import re
line1 = "你出生在2016-09-01"
line2 = "你出生在2016-9-1"
line3 = "你出生在2016/09/01"
line4 = "你出生在2016年9月1号"
line5 = "你出生在2016-09"

regex_str = "(你出生在\d{4}(-|/|年)\d{1,2}($|(月|/|-)\d{1,2}($|号)))"
match_obj = re.match(regex_str, line5)
if match_obj:
    print(match_obj.group(1))
    
# all five lines match
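
As a quick sanity check of the comment above (reusing the variables just defined), the same pattern can be run against every line:

for line in [line1, line2, line3, line4, line5]:
    match_obj = re.match(regex_str, line)
    if match_obj:
        print(match_obj.group(1))  # prints each birth-date string, so all five lines match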

A simple crawler

# -*- coding: utf-8 -*-
# @Author: Lai

import re
import os
import requests

BASE_PATH = "E:/EW/"


def get_block_link(url):
    html_obj = requests.get(url)
    reg = '<a target="_blank" href="(.*?)" class="l mr10"><img  onerror'
    link_list = re.findall(reg, html_obj.text, re.S)
    return link_list


# get the chapter-list (reader) link for each book
def get_child_link(url):
    links = []
    for link in get_block_link(url):
        html_obj = requests.get(link)
        reg = '<a href="(.*?)" class="reader" title=".*?">.*?</a>'
        link = re.findall(reg, html_obj.text)[0]
        links.append(link)
    return links


# get the chapter title and the chapter body text from a chapter page
def get_charterName_and_content(url):
    html_obj = requests.get(url)
    html_obj.encoding = "GBK"
    reg_content = r'id="content"><script type="text/javascript">style5\(\);</script>(.*?)<script type="text/javascript">'
    if re.findall(reg_content, html_obj.text, re.S):
        content = re.findall(reg_content, html_obj.text, re.S)[0]
    else:
        content = "空"
    reg = r' class="article_title">.*?</a>  >.*?\s(.*?)</div>'
    if re.findall(reg, html_obj.text, re.S):
        charter_name = re.findall(reg, html_obj.text, re.S)[0]
    else:
        charter_name = "empty"
    res = (charter_name, content)
    return res


# get each chapter's link and save the chapter content to disk
def get_child_in_info(url):
    for link in get_child_link(url):
        html_obj = requests.get(link)
        html_obj.encoding = "GBK"
        reg = '<div class="chapName"><span class="r">.*?</span><strong>(.*?)</strong><div class="clear">'
        if re.findall(reg, html_obj.text, re.S):
            title = re.findall(reg, html_obj.text, re.S)[0]
        else:
            title = "empty"
        path = os.path.join(BASE_PATH, title)
        reg_url = '<li><a href="(.*?)" title=".*?">.*?</a></li>'
        urls = re.findall(reg_url, html_obj.text)
        for url in urls:
            chart_name, content = get_charterName_and_content(url)
            if not os.path.exists(path):
                os.mkdir(path)
            position = os.path.join(path, chart_name+".html")
            with open(position, "w", encoding="utf-8") as f:  # write UTF-8 explicitly so it does not depend on the OS default encoding
                f.write(content)


if __name__ == "__main__":
    real_url = "http://www.quanshuwang.com/list/1_1.html"
    # get_block_link(real_url)
    # print(get_child_link(real_url))
    get_child_in_info(real_url)
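
One note on the re.S flag used in the findall calls above: it lets . match newline characters as well, which is what allows these patterns to span several lines of HTML. A tiny illustration with a made-up snippet:

import re

html = '<div id="content">first line\nsecond line</div>'
print(re.findall('id="content">(.*?)</div>', html))        # [] -- . stops at the newline
print(re.findall('id="content">(.*?)</div>', html, re.S))  # ['first line\nsecond line']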
Original author: lilied
Original article: https://segmentfault.com/a/1190000012738273
This post was reposted from the web purely to share knowledge; if it infringes your rights, please contact the blog owner to have it removed.