bs4 练习用：爬取教务办通知，并将结果保存为 list。
代码如下：
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Base URL of the notice index; page paths below are appended to it.
url = "http://csw.jlu.edu.cn/index/"
# Site root. No longer used for link rewriting (links are resolved with
# urljoin against the page they came from), but kept so existing users of
# this module-level name keep working.
host = "http://csw.jlu.edu.cn/"
# Every scraped notice, as dicts with the keys "time", "title" and "href".
records = []

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10
# The "next page" link on the first page carries the highest page number.
NEXT_PAGE_PATTERN = re.compile(
    r'<a href="jbtz/(\d+)\.htm" class="Next">下页</a>'
)


def _fetch_html(page_url):
    """Download *page_url* and return its body decoded as UTF-8.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx HTTP status.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = "UTF-8"
    return response.text


def _parse_records(html, page_url, trim_time=False):
    """Extract the notice entries from one listing page.

    Args:
        html: Markup of the listing page.
        page_url: URL the markup was fetched from; relative links are
            resolved against it.
        trim_time: When true, keep only the last 10 characters of the
            time text.

    Returns:
        A list of dicts with the keys "time", "title" and "href". Pages or
        list items that lack the expected tags contribute nothing.
    """
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find("div", {"class": "text-list"})
    listing = container.find("ul") if container is not None else None
    if listing is None:
        return []

    parsed = []
    for li in listing.find_all("li"):
        time_tag = li.find("span", {"class": "time"})
        title_div = li.find("div", {"class": "title"})
        if time_tag is None or title_div is None:
            continue
        title_link = title_div.find("a")
        href_link = title_div.find("a", href=True)
        if title_link is None or href_link is None:
            continue

        time_text = time_tag.get_text()
        if trim_time:
            time_text = time_text[-10:]

        parsed.append(
            {
                "time": time_text,
                "title": title_link.get_text(),
                # urljoin resolves "../" segments the way a browser does.
                # Blindly replacing every "../" with the host produced a
                # doubled host for links containing more than one "../".
                "href": urljoin(page_url, href_link["href"]),
            }
        )
    return parsed


# Scrape the first page.
first_page_url = f"{url}jbtz.htm"
html = _fetch_html(first_page_url)
# NOTE(review): the first page keeps the full time text while later pages
# keep only its last 10 characters; preserved as-is -- confirm whether the
# first page should be trimmed as well.
records.extend(_parse_records(html, first_page_url))

# Look up the page count.
next_page = NEXT_PAGE_PATTERN.search(html)

# Scrape the remaining pages, from the highest page number down to 1.
if next_page:
    times = int(next_page.group(1))
    for page_no in range(times, 0, -1):
        page_url = f"{url}jbtz/{page_no}.htm"
        print(f"\n{page_url}\n")
        page_records = _parse_records(
            _fetch_html(page_url), page_url, trim_time=True
        )
        for record in page_records:
            print(record)
        records.extend(page_records)