在当今数字化时代,科研工作者面临着海量学术信息的挑战。有效地收集、筛选和分析相关领域的最新研究成果,对于保持科研竞争力至关重要。然而,手动检索和整理学术文献不仅耗时耗力,还容易出现疏漏。为了解决这一问题,我们可以借助自动化工具来提高文献检索的效率和准确性。
本文将介绍如何使用 Python 和 Selenium WebDriver 来自动化抓取 Web of Science 上的论文数据。我们以 IEEE SENSORS JOURNAL 为例,展示了如何编写脚本来模拟用户操作,包括登录、导航、搜索、以及批量提取论文标题和发表日期等信息。这种方法不仅可以大大提高文献收集的效率,还能为后续的数据分析奠定基础。
话不多说,直接上代码:
# Standard library
import datetime
import getpass
import os
import time
# import re
# import tkinter as tk
# from tkinter import simpledialog

# Third-party: Selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchWindowException,
    NoSuchElementException,
    StaleElementReferenceException,
)
# Capture the current date once at startup; the three parts feed only the
# banner log line below (the scraper itself does not depend on them).
now = datetime.datetime.now()
year, month, day = now.strftime("%Y %m %d").split()
print(f"Year: {year}, Month: {month}, Day: {day}")
# --- Browser startup ---------------------------------------------------------
# Launch Chrome, open the library portal and maximize the window.
print("正在尝试打开 wuyoutsg.com 网址")

# ChromeDriver binary path — adjust to the local installation.
driver_path = r'd:\chromedriver-win64\chromedriver.exe'

# BUG FIX: Selenium 4 removed the `executable_path` keyword argument; the
# driver path must be supplied through a Service object instead.
wd = webdriver.Chrome(service=Service(driver_path))

# Open the portal and give the page a moment to render before resizing.
wd.get('http://www.wuyoutsg.com')
time.sleep(3)  # brief pause so the page settles
wd.maximize_window()
print("wuyoutsg.com 网址已经在 Chrome 浏览器打开")
# --- Login -------------------------------------------------------------------
print("正在尝试输入账号")

# BUG FIX: the original referenced `username_str` / `password_str`, which were
# commented out above, so sending the keys raised a NameError at runtime.
# Read credentials from environment variables, falling back to an interactive
# prompt, so secrets never have to live in the source file.
username_str = os.environ.get('WUYOUTSG_USER') or input("请输入用户名: ")
password_str = os.environ.get('WUYOUTSG_PASS') or getpass.getpass("请输入密码: ")

# The placeholder-based XPaths depend on the site's current markup; adjust
# them if the login form changes.
username_input = wd.find_element(By.XPATH, '//input[@placeholder="用户名"]')
username_id = username_input.get_attribute('id')
print(f"用户名输入框的ID是: {username_id}")
username_input.send_keys(username_str)

password_input = wd.find_element(By.XPATH, '//input[@placeholder="密码"]')
password_id = password_input.get_attribute('id')
print(f"密码输入框的ID是: {password_id}")
password_input.send_keys(password_str)

print("已找到账号输入框并输入账号")
print("请完成人机身份验证并点击登录。")

# Wait up to 30 s (the original comment wrongly claimed 5 minutes) for the
# URL to become the post-login listing page, which signals a successful login.
wait = WebDriverWait(wd, 30)
wait.until(EC.url_to_be("http://www.wuyoutsg.com/e/action/ListInfo/?classid=62"))
print("-----账号已登陆成功------")

# Navigate to the database-entry listing page.
new_url = 'http://www.wuyoutsg.com/e/action/ListInfo/?classid=202'
print(f"正在尝试打开链接:{new_url}")
wd.get(new_url)
print(f"链接 {new_url} 已成功打开")
try:
    # Locate the 'Web of Science1' entry (up to 30 s) and open it; the link
    # spawns a second browser window/tab.
    waiter = WebDriverWait(wd, 30)
    entry_locator = (By.XPATH,
                     "//a[@class='entryItem']//strong[contains(., 'Web of Science1')]")
    wos_link = waiter.until(EC.element_to_be_clickable(entry_locator))
    print("Found 'Web of Science1' link")

    # Remember where we are before clicking, so we can find the new handle.
    home_handle = wd.current_window_handle
    wos_link.click()
    print("Clicked on 'Web of Science1' link")

    # Once a second window exists, hop over to it.
    waiter.until(EC.number_of_windows_to_be(2))
    new_handle = next(h for h in wd.window_handles if h != home_handle)
    wd.switch_to.window(new_handle)

    # Navigation counts as done once the URL differs from the listing page.
    waiter.until(EC.url_changes(new_url))
    print(f"New page loaded. Current URL: {wd.current_url}")
except TimeoutException:
    print("Timeout waiting for page to load")
except NoSuchWindowException:
    print("The window we were trying to switch to was closed")
except Exception as e:
    print(f"An error occurred: {str(e)}")
# 可以在这里添加更多与新页面交互的代码
try:
    # Dismiss the OneTrust cookie banner, which would otherwise intercept
    # clicks on the search form underneath it.
    accept_cookie_button = WebDriverWait(wd, 30).until(
        EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
    )
    accept_cookie_button.click()
    print("已成功点击'接受所有 Cookie'按钮")

    # Wait for the banner to vanish and the document to finish loading.
    # BUG FIX: the original repeated the identical readyState wait twice;
    # one wait is sufficient.
    WebDriverWait(wd, 10).until(
        EC.invisibility_of_element_located((By.ID, "onetrust-banner-sdk"))
    )
    WebDriverWait(wd, 10).until(
        lambda driver: driver.execute_script("return document.readyState") == "complete"
    )
    # NOTE(review): kept for parity with the original; the page source is only
    # held in memory — nothing is actually written to disk.
    updated_page_source = wd.page_source
    print("已保存接受Cookie后的页面源代码")

    # Open the search-field dropdown (defaults to "Topic").
    dropdown_button = WebDriverWait(wd, 10).until(
        EC.visibility_of_element_located((By.XPATH, "//button[@aria-label='Select search field Topic']"))
    )
    dropdown_button.click()
    print("已点击下拉菜单按钮")

    # List the available search-field options for the log.
    WebDriverWait(wd, 10).until(
        EC.visibility_of_element_located((By.CLASS_NAME, "dropdown-panel"))
    )
    options = wd.find_elements(By.XPATH, "//div[@role='listbox']/div[@role='option']")
    print("可用的选项:")
    for option in options:
        print(option.text)

    # Switch the search field to journal titles.
    for option in options:
        if option.text == "Publication/Source Titles":
            option.click()
            print("已选择 'Publication/Source Titles' 选项")
            break
    time.sleep(2)  # let the selection take effect

    updated_page_source = wd.page_source  # in-memory only, see note above
    print("已保存选择选项后的页面源代码")

    # Type the journal name and submit the search with Enter.
    wait = WebDriverWait(wd, 10)
    input_field = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@aria-label='Search box 1']")))
    input_field.send_keys("IEEE SENSORS JOURNAL")
    print("Entered 'IEEE SENSORS JOURNAL' into the search box")
    time.sleep(1)  # make sure the text landed before submitting
    input_field.send_keys(Keys.RETURN)
    print("已按Enter键进行搜索")
    print("Search input completed")
except Exception as e:
    # BUG FIX: the original message blamed the cookie button even though this
    # handler covers the whole cookie + search flow.
    print(f"处理Cookie或搜索流程时出现错误: {str(e)}")
# --- Pagination & extraction -------------------------------------------------
# Read the total page count from the paginator, then walk every results page,
# pulling the title and publication date of each record.
wait = WebDriverWait(wd, 30)
total_pages_element = wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "span.end-page.ng-star-inserted")))
total_pages = int(total_pages_element.text)

successfully_extracted = 0
page_number = 1

while page_number <= total_pages:
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "app-records-list")))

    # BUG FIX: the original hard-coded 50 records per page, which made the
    # short last page burn 10 scroll-and-retry attempts for every missing
    # row. Count the records actually present instead (fall back to 50 if
    # the count comes back empty).
    records_present = len(wd.find_elements(By.CSS_SELECTOR, "app-record.app-record-holder"))
    papers_on_page = records_present or 50

    for i in range(1, papers_on_page + 1):
        max_attempts = 10
        for attempt in range(max_attempts):
            try:
                # Records are rendered lazily; the scroll in the except
                # branch brings the next batch into the DOM.
                record = wd.find_element(
                    By.CSS_SELECTOR, f"app-record.app-record-holder:nth-child({i})")
                title = record.find_element(
                    By.CSS_SELECTOR, "a[data-ta='summary-record-title-link']").text
                date = record.find_element(
                    By.CSS_SELECTOR, "span[name='pubdate']").text
                print(f"论文 {successfully_extracted + 1}:")
                print(f"标题: {title}")
                print(f"日期: {date}")
                print("---")
                successfully_extracted += 1
                break
            except (NoSuchElementException, StaleElementReferenceException):
                if attempt < max_attempts - 1:
                    # Nudge the virtual scroller and retry.
                    # BUG FIX: the original also had a stray duplicate
                    # 600-px scroll after the retry loop; one scroll per
                    # failed attempt is enough.
                    wd.execute_script("window.scrollBy(0, 300);")
                    time.sleep(1)
                else:
                    print(f"无法提取第 {successfully_extracted + 1} 篇论文信息")

    print(f"第 {page_number} 页提取完成")

    if page_number < total_pages:
        try:
            next_button = WebDriverWait(wd, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-ta='next-page-button']"))
            )
            next_button.click()
            page_number += 1
            print(f"正在进入第 {page_number} 页")
            time.sleep(3)  # wait for the next page to load
            # BUG FIX: the scroll-to-top originally sat after the `break` in
            # the except branch; it belongs here, after a successful page turn.
            wd.execute_script("window.scrollTo(0, 0);")
            time.sleep(2)  # let the fresh page settle
        except TimeoutException:
            print("无法找到下一页按钮")
            break
    else:
        break

print(f"成功提取了 {successfully_extracted} 篇论文信息,总共 {total_pages} 页")
【安全提示】原文此处以明文公开了登录账号和密码,现已移除。请勿在公开文章或源代码中泄露任何凭据;建议通过环境变量(如 WUYOUTSG_USER / WUYOUTSG_PASS)或运行时交互输入的方式提供登录信息,并尽快修改已经泄露的密码。