A Detailed Guide to Implementing a Vertical Crawler System in Python
Source: 脚本之家    Time: 2022-03-04 12:27:55

html_downloader

from urllib import request
def download(url):
    # Fetch the raw bytes of a page; return None when the URL is missing
    # or the server does not answer with HTTP 200.
    if url is None:
        return None
    response = request.urlopen(url)
    if response.getcode() != 200:
        return None
    return response.read()
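download() hands the URL straight to urlopen with no headers and no timeout, so a slow or picky server can hang the crawler or reject the request outright. The variant below is only an illustrative sketch (not part of the original article): it sets a User-Agent, applies a timeout, and returns None on network errors instead of raising.

from urllib import request, error

def download_safe(url, timeout=10):
    # Illustrative variant of download(): adds a User-Agent header and a
    # timeout, and returns None instead of raising on connection errors.
    if url is None:
        return None
    req = request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        response = request.urlopen(req, timeout=timeout)
    except error.URLError:
        return None
    if response.getcode() != 200:
        return None
    return response.read()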

html_outputer

data_list = []
def collect_data(data):
    data_list.append(data)
def output_html():
    # Render every collected record as one row of an HTML table.
    fout = open("output.html", "w")
    fout.write("<html>")
    fout.write("<body>")
    fout.write("<table>")
    for dataitem in data_list:
        fout.write("<tr>")
        fout.write("<td>%s</td>" % dataitem["url"])
        fout.write("<td>%s</td>" % dataitem["title"])
        fout.write("<td>%s</td>" % dataitem["datetime"])
        fout.write("<td>%s</td>" % dataitem["visitcount"])
        fout.write("</tr>")
    fout.write("</table>")
    fout.write("</body>")
    fout.write("</html>")
    fout.close()

html_parser

import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def get_new_urls(page_url, soup):
    # Collect article links whose href matches /digits/digits/word/page.htm
    # and resolve each relative href against the current page URL.
    new_urls = set()
    links = soup.find_all("a", href=re.compile(r"/\d+/\d+/\w+/page\.htm"))
    for link in links:
        new_url = link["href"]
        new_full_url = urljoin(page_url, new_url)
        new_urls.add(new_full_url)
    return new_urls
def get_new_data(page_url, soup):
    # Extract the title, update time and visit count from an article page;
    # pages without an h1.arti-title node yield an empty dict.
    res_data = {}
    title_node = soup.find("h1", class_="arti-title")
    if title_node is None:
        return res_data
    res_data["title"] = title_node.get_text()
    datetime_node = soup.find("span", class_="arti-update")
    res_data["datetime"] = datetime_node.get_text()
    visitcount_node = soup.find("span", class_="WP_VisitCount")
    res_data["visitcount"] = visitcount_node.get_text()
    res_data["url"] = page_url
    return res_data
def parse(page_url, html_cont):
    # html_cont is the raw bytes returned by the downloader; returns a
    # (new_urls, new_data) tuple, or None if either argument is missing.
    if page_url is None or html_cont is None:
        return
    soup = BeautifulSoup(html_cont, "html.parser", from_encoding="utf-8")
    new_urls = get_new_urls(page_url, soup)
    new_data = get_new_data(page_url, soup)
    return new_urls, new_data
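The selectors above (h1.arti-title, span.arti-update, span.WP_VisitCount, and links ending in page.htm) are specific to the target news site. The snippet below runs parse() against a made-up page that matches those patterns, purely to illustrate the shape of the return value; the markup and URL are invented for the example.

if __name__ == "__main__":
    # Made-up HTML that matches the selectors used above, for illustration only.
    sample = """
    <html><body>
    <h1 class="arti-title">Sample article</h1>
    <span class="arti-update">2022-03-04</span>
    <span class="WP_VisitCount">42</span>
    <a href="/2022/0304/c123a456/page.htm">another article</a>
    </body></html>
    """
    urls, data = parse("http://news.zzuli.edu.cn/index.htm", sample.encode("utf-8"))
    print(urls)   # {'http://news.zzuli.edu.cn/2022/0304/c123a456/page.htm'}
    print(data)   # {'title': 'Sample article', 'datetime': '2022-03-04', 'visitcount': '42', 'url': ...}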

spider_main

import urls_manager, html_downloader, \
    html_parser, html_outputer
def craw(root_url):
    count = 1
    urls_manager.add_new_url(root_url)
    # start the crawl loop
    while urls_manager.has_new_url():
        new_url = urls_manager.get_new_url()
        print("craw %d : %s" % (count, new_url))
        html_cont = html_downloader.download(new_url)
        new_urls, new_data = html_parser.parse(new_url, html_cont)
        urls_manager.add_new_urls(new_urls)
        if new_data:
            html_outputer.collect_data(new_data)
        if count == 10:
            break
        count = count + 1
    html_outputer.output_html()

if __name__ == "__main__":
    root_url = "http://news.zzuli.edu.cn/"
    craw(root_url)
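One fragile spot in craw(): when download() returns None, parse() also returns None and the tuple unpacking raises a TypeError, stopping the whole crawl. The sketch below is not the article's code, just one way to guard each step so that a single bad page is skipped instead of crashing the loop.

def craw_safe(root_url, max_pages=10):
    # Illustrative variant of craw(): skips pages that fail to download
    # or parse instead of aborting the whole crawl.
    count = 1
    urls_manager.add_new_url(root_url)
    while urls_manager.has_new_url() and count <= max_pages:
        new_url = urls_manager.get_new_url()
        print("craw %d : %s" % (count, new_url))
        html_cont = html_downloader.download(new_url)
        if html_cont is None:
            continue                      # download failed, try the next URL
        parsed = html_parser.parse(new_url, html_cont)
        if parsed is None:
            continue
        new_urls, new_data = parsed
        urls_manager.add_new_urls(new_urls)
        if new_data:
            html_outputer.collect_data(new_data)
        count += 1
    html_outputer.output_html()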

test_64

from bs4 import BeautifulSoup
import re
html_doc = """
The Dormouse"s story

The Dormouse"s story

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...

""" soup = BeautifulSoup(html_doc, "html.parser") print("获取所有链接") links = soup.find_all("a") for link in links: print(link.name, link["href"], link.get_text()) print("获取lacie链接") link_node = soup.find("a", href="http://example.com/lacie") print(link_node.name, link_node["href"], link_node.get_text()) print("正则匹配") link_node = soup.find("a", href=re.compile(r"ill")) print(link_node.name, link_node["href"], link_node.get_text()) print("获取P段落文字") p_node = soup.find("p", class_="title") print(p_node.name, p_node.get_text())

urls_manager

new_urls = set()   # URLs waiting to be crawled
old_urls = set()   # URLs already handed out to the crawler
def add_new_url(url):
    if url is None:
        return
    if url not in new_urls and url not in old_urls:
        new_urls.add(url)
def add_new_urls(urls):
    if urls is None or len(urls) == 0:
        return
    for url in urls:
        add_new_url(url)
def get_new_url():
    new_url = new_urls.pop()
    old_urls.add(new_url)
    return new_url
def has_new_url():
    return len(new_urls) != 0
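The manager is just two sets: new_urls holds pages waiting to be crawled and old_urls remembers pages already handed out, so the same URL is never crawled twice. A quick check of the de-duplication behaviour (the URL below is an arbitrary example):

if __name__ == "__main__":
    add_new_url("http://news.zzuli.edu.cn/a/page.htm")
    add_new_url("http://news.zzuli.edu.cn/a/page.htm")   # duplicate, ignored
    print(has_new_url())   # True
    url = get_new_url()    # moves the URL into old_urls
    add_new_url(url)       # already crawled, ignored
    print(has_new_url())   # False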

Summary

That's all for this article. I hope it has been helpful, and I hope you will keep following 脚本之家 for more content!
