1、BeautifulSoup的安装
相关文档:安装beautifulsoup4到Python3的方法(系统中默认使用的是Python2.7)
2、使用BeautifulSoup爬取网页的示例代码
# Example: download the Wikipedia "Mathematics" article, print the text of
# every <p> element, then print the first five paragraphs joined together.
import bs4
import requests

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get("https://en.wikipedia.org/wiki/Mathematics", timeout=10)

# requests.get() never returns None (it returns a Response or raises), so
# check the HTTP status to decide whether the body is worth parsing.
if response.ok:
    html = bs4.BeautifulSoup(response.text, 'html.parser')

    # select() takes a CSS selector and returns a list of matching tags.
    paragraphs = html.select("p")
    for para in paragraphs:
        print(para.text)

    # Keep only the first five paragraphs as the article introduction
    # (NOTE(review): five is a heuristic taken from the original example;
    # confirm it still matches the page layout).
    intro = '\n'.join(para.text for para in paragraphs[:5])
    print(intro)
else:
    print("Request failed with HTTP status", response.status_code)
或者参考下面的另一个示例：
# Example: download an archived National Gallery of Art artist index page
# and print the text of every artist link found in its 'BodyText' section.
import requests
from bs4 import BeautifulSoup

# A timeout keeps the script from hanging forever on an unresponsive server.
page = requests.get(
    'https://web.archive.org/web/20121007172955/https://www.nga.gov/collection/anZ1.htm',
    timeout=10,
)
# Fail loudly on an HTTP error instead of parsing an error page.
page.raise_for_status()

soup = BeautifulSoup(page.text, 'html.parser')

# Remove the 'AlphaNav' element from the tree so that its links are not
# picked up below. find() returns None when nothing matches, so guard first.
last_links = soup.find(class_='AlphaNav')
if last_links is not None:
    last_links.decompose()

# Collect every <a> tag inside the 'BodyText' element (none if it is missing).
artist_name_list = soup.find(class_='BodyText')
if artist_name_list is not None:
    artist_name_list_items = artist_name_list.find_all('a')
else:
    artist_name_list_items = []

# Use .contents to pull out the <a> tag's children; the first child is
# printed as the artist name. Anchors with no children are skipped.
for artist_name in artist_name_list_items:
    if not artist_name.contents:
        continue
    names = artist_name.contents[0]
    print(names)