A Detailed Guide to Implementing a Vertical Crawler System in Python
html_downloader
from urllib import request

def download(url):
    if url is None:
        return None
    response = request.urlopen(url)
    # Only accept successful responses
    if response.getcode() != 200:
        return None
    return response.read()
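As written, download() raises an uncaught exception on any network error, which kills the whole crawl. Below is a minimal sketch of a more defensive variant; the download_safe name, the browser-like User-Agent header, and the 10-second timeout are my own additions, not part of the original article:

from urllib import request, error

def download_safe(url):
    # Hypothetical hardened variant of download(); not from the original article
    if url is None:
        return None
    # Some servers reject urllib's default User-Agent, so send a browser-like one
    req = request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        response = request.urlopen(req, timeout=10)
    except error.URLError:
        return None  # treat network failures like a bad status: skip this page
    if response.getcode() != 200:
        return None
    return response.read()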
html_outputer
data_list = []

def collect_data(data):
    data_list.append(data)

def output_html():
    fout = open("output.html", "w")
    fout.write("<html>")
    fout.write("<body>")
    fout.write("<table>")
    # One table row per crawled article
    for dataitem in data_list:
        fout.write("<tr>")
        fout.write("<td>%s</td>" % dataitem["url"])
        fout.write("<td>%s</td>" % dataitem["title"])
        fout.write("<td>%s</td>" % dataitem["datetime"])
        fout.write("<td>%s</td>" % dataitem["visitcount"])
        fout.write("</tr>")
    fout.write("</table>")
    fout.write("</body>")
    fout.write("</html>")
    fout.close()
html_parser
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

def get_new_urls(page_url, soup):
    new_urls = set()
    # Article pages on this site follow the pattern /<digits>/<digits>/<word>/page.htm
    links = soup.find_all("a", href=re.compile(r"/\d+/\d+/\w+/page\.htm"))
    for link in links:
        new_url = link["href"]
        # Resolve relative links against the page they came from
        new_full_url = urljoin(page_url, new_url)
        new_urls.add(new_full_url)
    return new_urls

def get_new_data(page_url, soup):
    res_data = {}
    title_node = soup.find("h1", class_="arti-title")
    # Pages without an article title are list pages; return an empty dict
    if title_node is None:
        return res_data
    res_data["title"] = title_node.get_text()
    datetime_node = soup.find("span", class_="arti-update")
    res_data["datetime"] = datetime_node.get_text()
    visitcount_node = soup.find("span", class_="WP_VisitCount")
    res_data["visitcount"] = visitcount_node.get_text()
    res_data["url"] = page_url
    return res_data

def parse(page_url, html_cont):
    if page_url is None or html_cont is None:
        # Return a pair so the caller's tuple unpacking never fails
        return None, None
    soup = BeautifulSoup(html_cont, "html.parser", from_encoding="utf-8")
    new_urls = get_new_urls(page_url, soup)
    new_data = get_new_data(page_url, soup)
    return new_urls, new_data
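To see what get_new_urls() actually collects, here is a small check with a made-up list-page snippet; the two URLs are invented for illustration, and the code assumes it runs inside html_parser so get_new_urls is in scope:

from bs4 import BeautifulSoup

# One link matches the /digits/digits/word/page.htm pattern, the other does not
snippet = """
<a href="/2022/0801/c123a456/page.htm">an article</a>
<a href="/about.htm">about</a>
"""
soup = BeautifulSoup(snippet, "html.parser")
print(get_new_urls("http://news.zzuli.edu.cn/index.htm", soup))
# expected: {'http://news.zzuli.edu.cn/2022/0801/c123a456/page.htm'}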
spider_main
import urls_manager, html_downloader, html_parser, html_outputer

def craw(root_url):
    count = 1
    urls_manager.add_new_url(root_url)
    # Start the crawl loop
    while urls_manager.has_new_url():
        new_url = urls_manager.get_new_url()
        print("craw %d : %s" % (count, new_url))
        html_cont = html_downloader.download(new_url)
        new_urls, new_data = html_parser.parse(new_url, html_cont)
        urls_manager.add_new_urls(new_urls)
        if new_data:
            html_outputer.collect_data(new_data)
        # Cap the demo at ten pages so it finishes quickly
        if count == 10:
            break
        count = count + 1
    html_outputer.output_html()

if __name__ == "__main__":
    root_url = "http://news.zzuli.edu.cn/"
    craw(root_url)
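The scheduler fires requests back-to-back. When pointing the crawler at a live site, it is friendlier to pause between fetches; here is a minimal sketch of the same loop with a politeness delay. The craw_polite name, the max_pages parameter, and the one-second default are my additions, not part of the original design:

import time

def craw_polite(root_url, max_pages=10, delay=1.0):
    # Hypothetical variant of craw(); assumes the same four modules are importable
    count = 1
    urls_manager.add_new_url(root_url)
    while urls_manager.has_new_url():
        new_url = urls_manager.get_new_url()
        print("craw %d : %s" % (count, new_url))
        html_cont = html_downloader.download(new_url)
        new_urls, new_data = html_parser.parse(new_url, html_cont)
        urls_manager.add_new_urls(new_urls)
        if new_data:
            html_outputer.collect_data(new_data)
        if count == max_pages:
            break
        count += 1
        time.sleep(delay)  # pause before the next request
    html_outputer.output_html()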
test_64
from bs4 import BeautifulSoup
import re

# Sample document from the Beautiful Soup documentation
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, "html.parser")

print("Fetch all links")
links = soup.find_all("a")
for link in links:
    print(link.name, link["href"], link.get_text())

print("Fetch the Lacie link")
link_node = soup.find("a", href="http://example.com/lacie")
print(link_node.name, link_node["href"], link_node.get_text())

print("Regex match")
link_node = soup.find("a", href=re.compile(r"ill"))
print(link_node.name, link_node["href"], link_node.get_text())

print("Fetch the title paragraph text")
p_node = soup.find("p", class_="title")
print(p_node.name, p_node.get_text())
urls_manager
# Two sets track crawl state: URLs waiting to be crawled, and URLs already crawled
new_urls = set()
old_urls = set()

def add_new_url(url):
    if url is None:
        return
    # Only queue URLs we have never seen before
    if url not in new_urls and url not in old_urls:
        new_urls.add(url)

def add_new_urls(urls):
    if urls is None or len(urls) == 0:
        return
    for url in urls:
        add_new_url(url)

def get_new_url():
    # Move a URL from the pending set to the crawled set
    new_url = new_urls.pop()
    old_urls.add(new_url)
    return new_url

def has_new_url():
    return len(new_urls) != 0
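A quick check of the manager's bookkeeping, run inside urls_manager; the URL is just the site root used elsewhere in the article:

add_new_url("http://news.zzuli.edu.cn/")
add_new_url("http://news.zzuli.edu.cn/")  # duplicate: ignored
print(has_new_url())                      # True, one URL pending
url = get_new_url()                       # moves it into old_urls
add_new_url(url)                          # already crawled, so not re-queued
print(has_new_url())                      # False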
Summary
That's all for this article. I hope it helps, and I hope you will keep following 脚本之家 for more content!