吉林大学软件学院教务办通知爬虫

bs4 练习用脚本:爬取教务办通知列表,把每条通知的时间、标题和链接保存为一个字典,并汇总到一个 list 中。

代码如下:

import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = "http://csw.jlu.edu.cn/index/"

host = "http://csw.jlu.edu.cn/"

records = list()

# 首页爬取
response = requests.get(f"{url}jbtz.htm")
response.encoding = "UTF-8"
html = response.text
soup = BeautifulSoup(html, "html.parser")
lis = soup.find("div", {"class": "text-list"}).find("ul").find_all("li")
for li in lis:
    # print("-----------------------------------------")
    record = dict()
    record["time"] = li.find("span", {"class": "time"}).get_text()
    record["title"] = li.find("div", {"class": "title"}).find("a").get_text()
    href = li.find("div", {"class": "title"}).find("a", href=True)["href"].replace("../", host)
    record["href"] = href
    # print(record)
    records.append(record)

# 查询页数
times = re.findall("<a href=\"jbtz/(.*?).htm\" class=\"Next\">下页</a>", html, re.S)

# 全部数据爬取
if times:
    times = int(times[0])
    for i in range(times):
        print(f"\n{url}jbtz/{times-i}.htm\n")
        response = requests.get(f"{url}jbtz/{times-i}.htm")
        response.encoding = "UTF-8"
        html = response.text
        soup = BeautifulSoup(html, "html.parser")
        lis = soup.find("div", {"class": "text-list"}).find("ul").find_all("li")
        for li in lis:
            record = dict()
            record["time"] = li.find("span", {"class": "time"}).get_text()[-10:]
            record["title"] = li.find("div", {"class": "title"}).find("a").get_text()
            href = li.find("div", {"class": "title"}).find("a", href=True)["href"].replace("../", host)
            record["href"] = href
            print(record)
            records.append(record)

 

点赞

发表评论