bs4 练习用:爬取教务办通知,并以 list 形式保存。
代码如下:
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = "http://csw.jlu.edu.cn/index/"
# Kept so the module still exposes this name; links are now resolved with
# urljoin() against the page they were found on instead of string replacement.
host = "http://csw.jlu.edu.cn/"
records = []

# Seconds to wait for the server before giving up; requests has no default
# timeout, so without this a stalled connection would block forever.
REQUEST_TIMEOUT = 10

# Matches the "next page" link on the first listing page and captures the
# page number it points to.  The number is restricted to digits and the dot
# is escaped so the match cannot stretch across other links.
NEXT_PAGE_PATTERN = re.compile(
    r'<a href="jbtz/(\d+)\.htm" class="Next">下页</a>'
)


def fetch_html(page_url):
    """Download *page_url* and return its body decoded as UTF-8.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = "UTF-8"
    return response.text


def parse_records(html, page_url):
    """Extract the notice entries from one listing page.

    Args:
        html: Markup of the listing page.
        page_url: URL the markup was fetched from; relative links in the
            page are resolved against it.

    Returns:
        A list of dicts with the keys ``"time"``, ``"title"`` and ``"href"``,
        one per ``<li>`` inside the first ``<ul>`` of the ``text-list`` div.
    """
    soup = BeautifulSoup(html, "html.parser")
    items = soup.find("div", {"class": "text-list"}).find("ul").find_all("li")

    parsed = []
    for li in items:
        title_div = li.find("div", {"class": "title"})
        time_text = li.find("span", {"class": "time"}).get_text()
        link = title_div.find("a", href=True)
        parsed.append({
            # Keep only the trailing 10 characters on every page, so all
            # records share one format (presumably a YYYY-MM-DD date;
            # confirm against the live markup).
            "time": time_text.strip()[-10:],
            "title": title_div.find("a").get_text(),
            # urljoin collapses any number of leading "../" segments
            # correctly; a plain replace("../", host) would insert the host
            # once per segment.
            "href": urljoin(page_url, link["href"]),
        })
    return parsed


# Crawl the first page.
first_page_url = f"{url}jbtz.htm"
html = fetch_html(first_page_url)
records.extend(parse_records(html, first_page_url))

# Look up the page number referenced by the "next page" link; the numbered
# pages are then visited from that number down to 1.
match = NEXT_PAGE_PATTERN.search(html)
times = int(match.group(1)) if match else 0

# Crawl all remaining pages.
for page in range(times, 0, -1):
    page_url = f"{url}jbtz/{page}.htm"
    print(f"\n{page_url}\n")
    for record in parse_records(fetch_html(page_url), page_url):
        print(record)
        records.append(record)