在 PyCharm 中编写如下代码：
import requests
from bs4 import BeautifulSoup
import bs4
import re
def getHTMLText(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into an exception so they take the "" path.
        r.raise_for_status()
        # Decode with the encoding guessed from the body instead of the
        # header-declared one.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-level failures are treated as "no page"; anything else
        # (KeyboardInterrupt, programming errors) is allowed to propagate.
        return ""
def remove_spaces_re1(string):
    """Return *string* with every run of whitespace removed.

    Args:
        string: Text to clean.

    Returns:
        A copy of *string* with all characters matched by the regex class
        ``\\s`` (spaces, tabs, newlines, ...) deleted.
    """
    return re.sub(r"\s+", "", string)
def fillUnivList(ulist, html):
    """Parse the ranking table in *html* and append one row per university.

    Each appended row is ``[rank, name, score]`` with all whitespace stripped
    from every field.

    Args:
        ulist: List that receives the parsed rows; it is modified in place.
        html: Page markup to parse.

    Returns:
        None. When *html* contains no ``<tbody>`` nothing is appended.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # No table body at all (e.g. empty markup): nothing to extract.
        return
    for tr in tbody.children:
        # Children also include bare text nodes between rows; skip them.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < 5:
            # Cells 0, 1 and 4 are read below; shorter rows cannot be parsed.
            continue
        try:
            # Name is the first <a> inside the third <div> of the second cell.
            name_tag = tds[1]('div')[2]('a')[0]
        except IndexError:
            # Row does not have the expected nested structure.
            continue
        name = name_tag.string
        if name is None:
            # .string is None when the tag has several children; fall back to
            # the concatenated text so re.sub does not receive None.
            name = name_tag.get_text()
        rank = str(tds[0].string)
        score = str(tds[4].string)
        ulist.append([
            remove_spaces_re1(rank),
            remove_spaces_re1(name),
            remove_spaces_re1(score),
        ])
def printUnivList(ulist, num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Args:
        ulist: Sequence of rows; each row is indexed as
            ``[rank, name, score]``.
        num: Maximum number of rows to print. When *ulist* holds fewer rows,
            only the available ones are printed; zero or a negative value
            prints the header only.
    """
    row_fmt = "{:^20}\t{:^16}\t{:^20}"
    print(row_fmt.format("排名", "学校名称", "总分"))
    # Slicing never reads past the end of the list; clamping at 0 keeps a
    # negative num from being interpreted as "all but the last |num| rows".
    for u in ulist[:max(num, 0)]:
        print(row_fmt.format(u[0], u[1], u[2]))
if __name__ == '__main__':
    # Script entry point: download the ranking page, parse it, print the rows.
    uinfo = []
    url = "https://www.shanghairanking.cn/rankings/bcur/2024"
    html = getHTMLText(url)
    if not html:
        # getHTMLText returns "" on failure; stop instead of parsing nothing.
        raise SystemExit("Failed to download " + url)
    fillUnivList(uinfo, html)
    # Never request more rows than were actually parsed.
    printUnivList(uinfo, min(20, len(uinfo)))
运行结果: