在当今数字化时代,科研工作者面临着海量学术信息的挑战。有效地收集、筛选和分析相关领域的最新研究成果,对于保持科研竞争力至关重要。然而,手动检索和整理学术文献不仅耗时耗力,还容易出现疏漏。为了解决这一问题,我们可以借助自动化工具来提高文献检索的效率和准确性。
本文将介绍如何使用 Python 和 Selenium WebDriver 来自动化抓取 Web of Science 上的论文数据。我们以 IEEE SENSORS JOURNAL 为例,展示了如何编写脚本来模拟用户操作,包括登录、导航、搜索、以及批量提取论文标题和发表日期等信息。这种方法不仅可以大大提高文献收集的效率,还能为后续的数据分析奠定基础。
话不多说,直接上代码:
# Standard library
import datetime
import getpass
import os
import time
# import re
# import tkinter as tk
# from tkinter import simpledialog

# Third-party: Selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchWindowException,
    NoSuchElementException,
    StaleElementReferenceException,
)
# Capture the current date once at startup; the three parts feed only the
# banner log line below (the scraper itself does not depend on them).
now = datetime.datetime.now()
year, month, day = now.strftime("%Y %m %d").split()
print(f"Year: {year}, Month: {month}, Day: {day}")
# --- Browser startup ---------------------------------------------------------
# Launch Chrome, open the library portal and maximize the window.
print("正在尝试打开 wuyoutsg.com 网址")

# ChromeDriver binary path — adjust to the local installation.
driver_path = r'd:\chromedriver-win64\chromedriver.exe'

# BUG FIX: Selenium 4 removed the `executable_path` keyword argument; the
# driver path must be supplied through a Service object instead.
wd = webdriver.Chrome(service=Service(driver_path))

# Open the portal and give the page a moment to render before resizing.
wd.get('http://www.wuyoutsg.com')
time.sleep(3)  # brief pause so the page settles
wd.maximize_window()
print("wuyoutsg.com 网址已经在 Chrome 浏览器打开")
# --- Login -------------------------------------------------------------------
print("正在尝试输入账号")

# BUG FIX: the original referenced `username_str` / `password_str`, which were
# commented out above, so sending the keys raised a NameError at runtime.
# Read credentials from environment variables, falling back to an interactive
# prompt, so secrets never have to live in the source file.
username_str = os.environ.get('WUYOUTSG_USER') or input("请输入用户名: ")
password_str = os.environ.get('WUYOUTSG_PASS') or getpass.getpass("请输入密码: ")

# The placeholder-based XPaths depend on the site's current markup; adjust
# them if the login form changes.
username_input = wd.find_element(By.XPATH, '//input[@placeholder="用户名"]')
username_id = username_input.get_attribute('id')
print(f"用户名输入框的ID是: {username_id}")
username_input.send_keys(username_str)

password_input = wd.find_element(By.XPATH, '//input[@placeholder="密码"]')
password_id = password_input.get_attribute('id')
print(f"密码输入框的ID是: {password_id}")
password_input.send_keys(password_str)

print("已找到账号输入框并输入账号")
print("请完成人机身份验证并点击登录。")

# Wait up to 30 s (the original comment wrongly claimed 5 minutes) for the
# URL to become the post-login listing page, which signals a successful login.
wait = WebDriverWait(wd, 30)
wait.until(EC.url_to_be("http://www.wuyoutsg.com/e/action/ListInfo/?classid=62"))
print("-----账号已登陆成功------")

# Navigate to the database-entry listing page.
new_url = 'http://www.wuyoutsg.com/e/action/ListInfo/?classid=202'
print(f"正在尝试打开链接:{new_url}")
wd.get(new_url)
print(f"链接 {new_url} 已成功打开")
try:
    # Locate the 'Web of Science1' entry (up to 30 s) and open it; the link
    # spawns a second browser window/tab.
    waiter = WebDriverWait(wd, 30)
    entry_locator = (By.XPATH,
                     "//a[@class='entryItem']//strong[contains(., 'Web of Science1')]")
    wos_link = waiter.until(EC.element_to_be_clickable(entry_locator))
    print("Found 'Web of Science1' link")

    # Remember where we are before clicking, so we can find the new handle.
    home_handle = wd.current_window_handle
    wos_link.click()
    print("Clicked on 'Web of Science1' link")

    # Once a second window exists, hop over to it.
    waiter.until(EC.number_of_windows_to_be(2))
    new_handle = next(h for h in wd.window_handles if h != home_handle)
    wd.switch_to.window(new_handle)

    # Navigation counts as done once the URL differs from the listing page.
    waiter.until(EC.url_changes(new_url))
    print(f"New page loaded. Current URL: {wd.current_url}")
except TimeoutException:
    print("Timeout waiting for page to load")
except NoSuchWindowException:
    print("The window we were trying to switch to was closed")
except Exception as e:
    print(f"An error occurred: {str(e)}")
# 可以在这里添加更多与新页面交互的代码
try:
    # Dismiss the OneTrust cookie banner, which would otherwise intercept
    # clicks on the search form underneath it.
    accept_cookie_button = WebDriverWait(wd, 30).until(
        EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
    )
    accept_cookie_button.click()
    print("已成功点击'接受所有 Cookie'按钮")

    # Wait for the banner to vanish and the document to finish loading.
    # BUG FIX: the original repeated the identical readyState wait twice;
    # one wait is sufficient.
    WebDriverWait(wd, 10).until(
        EC.invisibility_of_element_located((By.ID, "onetrust-banner-sdk"))
    )
    WebDriverWait(wd, 10).until(
        lambda driver: driver.execute_script("return document.readyState") == "complete"
    )
    # NOTE(review): kept for parity with the original; the page source is only
    # held in memory — nothing is actually written to disk.
    updated_page_source = wd.page_source
    print("已保存接受Cookie后的页面源代码")

    # Open the search-field dropdown (defaults to "Topic").
    dropdown_button = WebDriverWait(wd, 10).until(
        EC.visibility_of_element_located((By.XPATH, "//button[@aria-label='Select search field Topic']"))
    )
    dropdown_button.click()
    print("已点击下拉菜单按钮")

    # List the available search-field options for the log.
    WebDriverWait(wd, 10).until(
        EC.visibility_of_element_located((By.CLASS_NAME, "dropdown-panel"))
    )
    options = wd.find_elements(By.XPATH, "//div[@role='listbox']/div[@role='option']")
    print("可用的选项:")
    for option in options:
        print(option.text)

    # Switch the search field to journal titles.
    for option in options:
        if option.text == "Publication/Source Titles":
            option.click()
            print("已选择 'Publication/Source Titles' 选项")
            break
    time.sleep(2)  # let the selection take effect

    updated_page_source = wd.page_source  # in-memory only, see note above
    print("已保存选择选项后的页面源代码")

    # Type the journal name and submit the search with Enter.
    wait = WebDriverWait(wd, 10)
    input_field = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@aria-label='Search box 1']")))
    input_field.send_keys("IEEE SENSORS JOURNAL")
    print("Entered 'IEEE SENSORS JOURNAL' into the search box")
    time.sleep(1)  # make sure the text landed before submitting
    input_field.send_keys(Keys.RETURN)
    print("已按Enter键进行搜索")
    print("Search input completed")
except Exception as e:
    # BUG FIX: the original message blamed the cookie button even though this
    # handler covers the whole cookie + search flow.
    print(f"处理Cookie或搜索流程时出现错误: {str(e)}")
# --- Pagination & extraction -------------------------------------------------
# Read the total page count from the paginator, then walk every results page,
# pulling the title and publication date of each record.
wait = WebDriverWait(wd, 30)
total_pages_element = wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "span.end-page.ng-star-inserted")))
total_pages = int(total_pages_element.text)

successfully_extracted = 0
page_number = 1

while page_number <= total_pages:
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "app-records-list")))

    # BUG FIX: the original hard-coded 50 records per page, which made the
    # short last page burn 10 scroll-and-retry attempts for every missing
    # row. Count the records actually present instead (fall back to 50 if
    # the count comes back empty).
    records_present = len(wd.find_elements(By.CSS_SELECTOR, "app-record.app-record-holder"))
    papers_on_page = records_present or 50

    for i in range(1, papers_on_page + 1):
        max_attempts = 10
        for attempt in range(max_attempts):
            try:
                # Records are rendered lazily; the scroll in the except
                # branch brings the next batch into the DOM.
                record = wd.find_element(
                    By.CSS_SELECTOR, f"app-record.app-record-holder:nth-child({i})")
                title = record.find_element(
                    By.CSS_SELECTOR, "a[data-ta='summary-record-title-link']").text
                date = record.find_element(
                    By.CSS_SELECTOR, "span[name='pubdate']").text
                print(f"论文 {successfully_extracted + 1}:")
                print(f"标题: {title}")
                print(f"日期: {date}")
                print("---")
                successfully_extracted += 1
                break
            except (NoSuchElementException, StaleElementReferenceException):
                if attempt < max_attempts - 1:
                    # Nudge the virtual scroller and retry.
                    # BUG FIX: the original also had a stray duplicate
                    # 600-px scroll after the retry loop; one scroll per
                    # failed attempt is enough.
                    wd.execute_script("window.scrollBy(0, 300);")
                    time.sleep(1)
                else:
                    print(f"无法提取第 {successfully_extracted + 1} 篇论文信息")

    print(f"第 {page_number} 页提取完成")

    if page_number < total_pages:
        try:
            next_button = WebDriverWait(wd, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-ta='next-page-button']"))
            )
            next_button.click()
            page_number += 1
            print(f"正在进入第 {page_number} 页")
            time.sleep(3)  # wait for the next page to load
            # BUG FIX: the scroll-to-top originally sat after the `break` in
            # the except branch; it belongs here, after a successful page turn.
            wd.execute_script("window.scrollTo(0, 0);")
            time.sleep(2)  # let the fresh page settle
        except TimeoutException:
            print("无法找到下一页按钮")
            break
    else:
        break

print(f"成功提取了 {successfully_extracted} 篇论文信息,总共 {total_pages} 页")
【安全提示】原文此处以明文公开了登录账号和密码,现已移除。请勿在公开文章或源代码中泄露任何凭据;建议通过环境变量(如 WUYOUTSG_USER / WUYOUTSG_PASS)或运行时交互输入的方式提供登录信息,并尽快修改已经泄露的密码。