Crawling multiple websites with Selenium, launched from a click-to-run GUI

Selenium crawler code

webcrawl.py

import re
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class AgriInfoSpider:
    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # headless mode
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--no-sandbox")
        # Selenium 4 removed the executable_path argument; wrap the driver path in a Service object
        service = Service('C:\\Users\\Administrator\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe')
        self.driver = webdriver.Chrome(service=service, options=chrome_options)

    def save_to_json(self, item, filename):
        # Append one JSON object per line (JSON Lines format)
        with open(filename, 'a', encoding='utf-8') as f:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')

    def close(self):
        self.driver.quit()

    # TODO
    # Agronet (agronet.com.cn) - agricultural technology
    def agronet(self, stop_event):
        self.driver.get('http://www.agronet.com.cn/Tech/List.html')
        self.driver.implicitly_wait(60)

        # Get the industry categories
        industries = self.driver.find_elements(By.XPATH, '//dl[@class="product_classification_nav"]/dd/ul/li/a')
        item = {}
        order = 0
        # Click through each industry
        for m in range(len(industries)):
            if stop_event.is_set():
                break
            # Return to the category list and re-fetch by index: the old elements go stale after paging
            self.driver.get('http://www.agronet.com.cn/Tech/List.html')
            industry = self.driver.find_elements(By.XPATH, '//dl[@class="product_classification_nav"]/dd/ul/li/a')[m]
            item["industry"] = industry.text
            industry.click()
            # Make sure the right page has loaded
            # WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//dl[@class="arrow_700"]/dt/span/em[2]'), item["industry"]))
            articles = self.driver.find_elements(By.XPATH, '//dl[@class="arrow_700"]/dd/ul/li')

            while True:
                if stop_event.is_set():
                    break
                for i, article in enumerate(articles):
                    if stop_event.is_set():
                        break
                    item["order"] = order

                    try:
                        WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//dl[@class="arrow_700"]/dt/em'), "农业技术文章列表"))
                    except TimeoutException:
                        continue
                    # Article title (re-fetched by index to avoid stale references)
                    article = self.driver.find_elements(By.XPATH, '//dl[@class="arrow_700"]/dd/ul/li/span/a')[i]
                    item["title"] = article.text
                    item["date"] = re.search(r'\d{4}-\d{1,2}-\d{1,2}', self.driver.find_element(By.XPATH, '//dl[@class="arrow_700"]/dd/ul/li/div').text).group()
                    item["source"] = self.driver.find_element(By.XPATH, '//dl[@class="arrow_700"]/dd/ul/li/em').text

                    # Click the article
                    article.click()

                    # Get the handles of all open windows
                    window_handles = self.driver.window_handles
                    # Switch to the new tab
                    self.driver.switch_to.window(window_handles[-1])
                    try:
                        # Extract the article body
                        content = self.driver.find_elements(By.XPATH, '//div[@class="font_bottom"]/p')
                        content_lists = [c.text.strip() for c in content]
                        item["content"] = [''.join(content_lists)]
                    except Exception:
                        item["content"] = []

                    # Write the record to file
                    self.save_to_json(item, './results/agronet.json')

                    # Close the new tab
                    self.driver.close()

                    # Switch back to the original tab
                    self.driver.switch_to.window(self.driver.window_handles[0])

                    order += 1

                # Click "next page"
                try:
                    if stop_event.is_set():
                        break
                    next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
                    next_page.click()
                    if self.driver.current_url == 'http://www.agronet.com.cn/Message/Error?aspxerrorpath=/Tech/List':
                        break
                except Exception:
                    break

    # China Farmers' Daily network (farmer.com.cn) - rural news headlines
    def farmer(self, stop_event):
        self.driver.get('https://www.farmer.com.cn/farmer/xw/sntt/list.shtml')

        item = {}
        order = 0
        # Click through the articles
        while True:
            if stop_event.is_set():
                break
            # Fetch the article list (re-fetched on every page so the elements are never stale)
            articles = self.driver.find_elements(By.XPATH, '//div[contains(@class, "index-font")]')
            for article in articles:
                if stop_event.is_set():
                    break
                item["order"] = order
                item["title"] = article.text
                article.click()
                # Get the handles of all open windows
                window_handles = self.driver.window_handles
                # Switch to the new tab
                self.driver.switch_to.window(window_handles[-1])
                # Make sure the article detail tab has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="index-title"]/span[3]'), "详情"))
                except TimeoutException:
                    continue

                item["author"] = self.driver.find_element(By.XPATH, '//div[@class="index-introduce"]/ul/li[2]/span').text
                item["date"] = self.driver.find_element(By.XPATH, '//div[@class="index-introduce"]/ul/div/span').text
                item["source"] = self.driver.find_element(By.XPATH, '//div[@class="index-introduce"]/ul/li[1]/span').text
                content = self.driver.find_elements(By.XPATH, '//div[@class="textList"]/p')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write the record to file
                self.save_to_json(item, './results/farmer.json')

                # Close the new tab
                self.driver.close()

                # Switch back to the original tab
                self.driver.switch_to.window(self.driver.window_handles[0])

                order += 1

            # Click "next page"
            try:
                if stop_event.is_set():
                    break
                next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
                next_page.click()
            except Exception:
                break

    # agri.cn (China agriculture and rural affairs information network) - data - market trends
    def agri(self, stop_event):
        self.driver.get('http://www.agri.cn/sj/scdt/')

        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Fetch all articles on the page
            articles = self.driver.find_elements(By.XPATH, '//ul[@class="nxw_list_ul"]/li/div/div/p[1]/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[contains(@class, "title_common") and contains(@class, "title_common_w")]'), "市场动态"))
                except TimeoutException:
                    continue

                # Click the article (re-fetched by index to avoid stale references)
                article = self.driver.find_elements(By.XPATH, '//ul[@class="nxw_list_ul"]/li/div/div/p[1]/a')[i]
                article.click()

                # Get the handles of all open windows
                window_handles = self.driver.window_handles
                # Switch to the new tab
                self.driver.switch_to.window(window_handles[-1])

                # Make sure the article detail tab has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="bread_nav"]/a[3]'), "市场动态"))
                except TimeoutException:
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@class="detailCon_info_tit"]').text
                item["date"] = re.search(r'\d{4}-\d{1,2}-\d{1,2}', self.driver.find_element(By.XPATH, '//div[@class="updateInfo_mess"]/span[1]').text).group()
                # group(1) keeps just the source name, without the "来源:" label
                item["source"] = re.search(r'来源:(.+)', self.driver.find_element(By.XPATH, '//div[@class="updateInfo_mess"]/span[3]').text).group(1)
                try:
                    content = self.driver.find_elements(By.XPATH, '//div[contains(@class, "content_body_box") and contains(@class, "ArticleDetails")]/p')
                    content_lists = [c.text.strip() for c in content]
                    item["content"] = [''.join(content_lists)]
                except Exception:
                    item["content"] = []

                # Write the record to file
                self.save_to_json(item, './results/agri.json')

                # Close the new tab
                self.driver.close()

                # Switch back to the original tab
                self.driver.switch_to.window(self.driver.window_handles[0])

                order += 1

            # Click "next page" until the last page is reached
            pages_sum = re.search(r'\d+', self.driver.find_element(By.XPATH, '//font[contains(@class, "clear") and contains(@class, "jump_con")]/span[2]').text).group()
            pages_cur = re.search(r'\d+', self.driver.find_element(By.XPATH, '//a[@class=" cur"]').text).group()

            if pages_cur != pages_sum:
                next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
                next_page.click()
            else:
                break

    # Xinnong (xinnong.net) - agricultural news - industry information
    def xinnong(self, stop_event):
        self.driver.get('http://www.xinnong.net/news/hangye/list_14_1.html')
        self.driver.implicitly_wait(60)

        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Fetch all articles on the page
            articles = self.driver.find_elements(By.XPATH, '//div[@class="newslist"]/ul/li/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="lsttit"]/h1'), "行业资讯"))
                except TimeoutException:
                    continue

                # Click the article (re-fetched by index to avoid stale references)
                article = self.driver.find_elements(By.XPATH, '//div[@class="newslist"]/ul/li/a')[i]
                article.click()

                # Get the handles of all open windows
                window_handles = self.driver.window_handles
                # Switch to the new tab
                self.driver.switch_to.window(window_handles[-1])

                # Make sure the article detail tab has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="spos"]/a[3]'), "行业资讯"))
                except TimeoutException:
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@class="arctit"]/h1').text
                item["date"] = re.search(r'\d{4}-\d{1,2}-\d{1,2}', self.driver.find_element(By.XPATH, '//div[@class="arcinfo"]').text).group()
                # group(1) keeps just the source name, without the "来源:" label
                item["source"] = re.search(r'来源:(.+)', self.driver.find_element(By.XPATH, '//div[@class="arcinfo"]').text).group(1)
                content = self.driver.find_elements(By.XPATH, '//div[@class="arcont"]/p')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write the record to file
                self.save_to_json(item, './results/xinnong.json')

                # Close the new tab
                self.driver.close()

                # Switch back to the original tab
                self.driver.switch_to.window(self.driver.window_handles[0])

                order += 1

            # Click "next page"
            try:
                if stop_event.is_set():
                    break
                next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
                next_page.click()
            except Exception:
                break

    # Richagri (richagri.com) - industry news
    def richagri(self, stop_event):
        self.driver.get('http://www.richagri.com/news')

        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Fetch all articles on the page
            articles = self.driver.find_elements(By.XPATH, '//div[@class="head"]/ul/li/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="head"]/div[2]/a[2]'), "行业资讯"))
                except TimeoutException:
                    continue

                # Click the article (re-fetched by index to avoid stale references)
                article = self.driver.find_elements(By.XPATH, '//div[@class="head"]/ul/li/a')[i]
                article.click()

                # Make sure the article detail page has loaded (this site navigates in the same tab)
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="head"]/div[2]/a[2]'), "行业资讯"))
                except TimeoutException:
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@class="head"]/b').text
                item["date"] = re.search(r'\d{4}-\d{1,2}-\d{1,2}', self.driver.find_element(By.XPATH, '//div[@class="head"]/font').text).group()
                # The body sits between the "时间:" line and the "回顶部" link; cut it out with a regex
                content = self.driver.find_elements(By.XPATH, '//div[@class="head"]')
                content_lists = [c.text.strip('\n') for c in content]
                content_match = re.search(r'时间:(\d{4}-\d{1,2}-\d{1,2})\n(.*?)(?=\n回顶部)', content_lists[0], re.S)
                item["content"] = [content_match.group(2)] if content_match else []

                # Write the record to file
                self.save_to_json(item, './results/richagri.json')

                # Go back to the list page
                self.driver.back()

                order += 1

            # Click "next page"
            try:
                if stop_event.is_set():
                    break
                next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下页")]')
                next_page.click()
            except Exception:
                break

    # Jinnong (jinnong.cn) - market analysis
    def jinnong(self, stop_event):
        self.driver.get('https://www.jinnong.cn/1002/')

        # Track the article links that have already been crawled
        crawled_links = set()
        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Fetch all articles on the page
            articles = self.driver.find_elements(By.XPATH, '//ul[@class="left-side-items"]/li/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                article_link = article.get_attribute('href')
                if article_link not in crawled_links:
                    item["order"] = order
                    try:
                        WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//span[@class="current-path-2"]'), "农业市场分析"))
                    except TimeoutException:
                        continue

                    # Click the article (re-fetched by index to avoid stale references)
                    article = self.driver.find_elements(By.XPATH, '//ul[@class="left-side-items"]/li/a')[i]
                    article.click()

                    # Get the handles of all open windows
                    window_handles = self.driver.window_handles
                    # Switch to the new tab
                    self.driver.switch_to.window(window_handles[-1])

                    # Make sure the article detail tab has loaded
                    try:
                        WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="current-path"]/span[3]'), "正文"))
                    except TimeoutException:
                        continue

                    item["title"] = self.driver.find_element(By.XPATH, '//div[@class="article-title"]/h1').text
                    item["author"] = self.driver.find_element(By.XPATH, '//div[@class="article-title"]/div/div[1]/span[3]').text
                    item["date"] = self.driver.find_element(By.XPATH, '//div[@class="article-title"]/div/div[1]/span[1]').text
                    # group(1) keeps just the source name, without the "来源:" label
                    item["source"] = re.search(r'来源:(.+)', self.driver.find_element(By.XPATH, '//div[@class="article-title"]/div/div[1]/span[2]').text).group(1)
                    content = self.driver.find_elements(By.XPATH, '//div[@class="article-conte-infor"]')
                    content_lists = [c.text.strip() for c in content]
                    item["content"] = [''.join(content_lists)]

                    # Write the record to file
                    self.save_to_json(item, './results/jinnong.json')

                    # Record the crawled link
                    crawled_links.add(article_link)

                    # Close the new tab
                    self.driver.close()

                    # Switch back to the original tab
                    self.driver.switch_to.window(self.driver.window_handles[0])

                    order += 1

            # Click "load more"
            button = self.driver.find_element(By.CSS_SELECTOR, '.click_more a span').text
            if button == "点击加载更多":
                next_page = self.driver.find_element(By.CSS_SELECTOR, '.click_more a')
                self.driver.execute_script("arguments[0].click();", next_page)
            # Everything loaded: stop
            elif button == "已加载全部":
                break

    # China Rural Revitalization Service network (xczxfw.org.cn) - news - industry information
    def xczxfw(self, stop_event):
        self.driver.get('http://www.xczxfw.org.cn/news/12/list')

        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Fetch all articles on the page
            articles = self.driver.find_elements(By.XPATH, '//div[@class="zdhd"]/dl/dd/p/a')
            # Alternate selector:
            # articles = self.driver.find_elements(By.XPATH, '//dl[@class="lb_dt"]/dd/p/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order

                # Click the article (re-fetched by index to avoid stale references)
                article = self.driver.find_elements(By.XPATH, '//div[@class="zdhd"]/dl/dd/p/a')[i]
                # article = self.driver.find_elements(By.XPATH, '//dl[@class="lb_dt"]/dd/p/a')[i]
                article.click()

                # Get the handles of all open windows
                window_handles = self.driver.window_handles
                # Switch to the new tab
                self.driver.switch_to.window(window_handles[-1])

                # Make sure the article detail tab has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="gy_1"]/span/a[2]'), "行业资讯"))
                except TimeoutException:
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@class="zdhd"]/h1').text
                item["date"] = re.search(r'\d{4}-\d{1,2}-\d{1,2} \d{2}:\d{2}', self.driver.find_element(By.XPATH, '//div[@class="zdhd"]/h2').text).group()
                # group(1) keeps just the source name, without the "来源:" label
                item["source"] = re.search(r'来源:(.+)', self.driver.find_element(By.XPATH, '//div[@class="zdhd"]/h2').text).group(1)
                content = self.driver.find_elements(By.XPATH, '//div[@class="com_de"]/p')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write the record to file
                self.save_to_json(item, './results/xczxfw.json')

                # Close the new tab
                self.driver.close()

                # Switch back to the original tab
                self.driver.switch_to.window(self.driver.window_handles[0])

                order += 1

            # Click "next page"
            try:
                if stop_event.is_set():
                    break
                next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
                next_page.click()
            except Exception:
                break

    # Aweb data center (shuju.aweb.com.cn) - practical techniques
    def shujuaweb(self, stop_event):
        self.driver.get('http://shuju.aweb.com.cn/technology/technology-0-1.shtml')

        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Fetch all articles on the page
            articles = self.driver.find_elements(By.XPATH, '//ul[@class="newList2"]/li/a[2]')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//h2[@class="h2s"]'), "实用技术:"))
                except TimeoutException:
                    continue

                # Click the article (re-fetched by index to avoid stale references)
                article = self.driver.find_elements(By.XPATH, '//ul[@class="newList2"]/li/a[2]')[i]
                article.click()

                # Get the handles of all open windows
                window_handles = self.driver.window_handles
                # Switch to the new tab
                self.driver.switch_to.window(window_handles[-1])

                try:
                    # Make sure the article detail tab has loaded
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//ul[@class="sub"]/li[7]/a/span'), "实用技术"))
                except TimeoutException:
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@id="content"]/div[1]/div[1]/div/p[1]').text
                item["date"] = re.search(r'\d{4}年\d{1,2}月\d{1,2}日 \d{2}:\d{2}', self.driver.find_element(By.XPATH, '//div[@id="content"]/div[1]/div[1]/div/p[2]/span').text).group()
                # Guard against a missing source field
                source_element = self.driver.find_element(By.XPATH, '//div[@id="content"]/div[1]/div[1]/div/p[2]/span')
                source_match = re.search(r'来源:(.+)', source_element.text)
                item["source"] = source_match.group(1) if source_match else ""
                content = self.driver.find_elements(By.XPATH, '//ul[@class="name"]/following-sibling::p')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write the record to file
                self.save_to_json(item, './results/shujuaweb.json')

                # Close the new tab
                self.driver.close()

                # Switch back to the original tab
                self.driver.switch_to.window(self.driver.window_handles[0])

                order += 1

            # Click "next page"
            try:
                if stop_event.is_set():
                    break
                next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
                next_page.click()
            except Exception:
                break

    # 12316 rural information platform (12316.agri.cn) - headlines - updates
    def agri_12316(self, stop_event):
        self.driver.get('http://12316.agri.cn/news/A12316dt/index.html')
        self.driver.implicitly_wait(60)

        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Fetch all articles on the page
            articles = self.driver.find_elements(By.XPATH, '//ul[@class="dongtai_list"]/table[last()]/tbody/tr/td/li/a/p')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order

                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//p[@class="weizhi"]'), "12316头条-动态"))
                except TimeoutException:
                    continue

                # Click the article (re-fetched by index to avoid stale references)
                article = self.driver.find_elements(By.XPATH, '//ul[@class="dongtai_list"]/table[last()]/tbody/tr/td/li/a/p')[i]
                article.click()

                # Make sure the article detail page has loaded (this site navigates in the same tab)
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="content"]/p[1]'), "正文"))
                except TimeoutException:
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@class="detail_box"]/h3').text
                item["date"] = self.driver.find_element(By.XPATH, '//p[@class="zuozhe"]/span[1]/i').text
                item["source"] = self.driver.find_element(By.XPATH, '//p[@class="zuozhe"]/span[2]/i').text
                content = self.driver.find_elements(By.XPATH, '//div[@class="news_box"]')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write the record to file
                self.save_to_json(item, './results/agri_12316.json')

                # Go back to the list page
                self.driver.back()

                order += 1

            # Click "next page"
            try:
                if stop_event.is_set():
                    break
                next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
                next_page.click()
            except Exception:
                break

    # Wugu (wugu.com.cn) - farming techniques channel
    def wugu(self, stop_event):
        self.driver.get('http://www.wugu.com.cn/?cat=6')
        self.driver.implicitly_wait(60)

        item = {}
        order = 0

        # Get the total number of pages
        try:
            pages = self.driver.find_elements(By.XPATH, '//div[@class="nav-links"]/a')
            total_pages = int(pages[1].text)
        except (ValueError, IndexError):
            total_pages = 1

        for current_page in range(1, total_pages + 1):
            if stop_event.is_set():
                break
            # Open each page by URL
            page_url = f'http://www.wugu.com.cn/?paged={current_page}&cat=6'
            self.driver.get(page_url)

            # Fetch all articles on the page
            articles = self.driver.find_elements(By.XPATH, '//div[@class="mg-posts-sec-inner"]/article/div/h4/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="mg-breadcrumb-title"]/h1'), "分类:农技通"))
                    # Click the article (re-fetched by index to avoid stale references)
                    article = self.driver.find_elements(By.XPATH, '//div[@class="mg-posts-sec-inner"]/article/div/h4/a')[i]
                    self.driver.execute_script("arguments[0].click();", article)
                except TimeoutException:
                    continue

                # Make sure the article detail page has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="mg-header"]/div[1]/a'), "农技通"))
                except TimeoutException:
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@class="mg-header"]/h1/a').text
                item["date"] = self.driver.find_element(By.XPATH, '//div[@class="mg-header"]/div[2]/div/span[1]').text
                content = self.driver.find_elements(By.XPATH, '//div[@class="mg-blog-post-box"]/article/p')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write the record to file
                self.save_to_json(item, './results/wugu.json')

                # Go back to the list page
                self.driver.back()

                order += 1

    # New rural commerce network (nc.mofcom.gov.cn) - agricultural news - policies and regulations
    def mofcom(self, stop_event):
        self.driver.get('http://nc.mofcom.gov.cn/nyzx/zcfg')
        self.driver.implicitly_wait(60)

        item = {}
        order = 0

        while True:
            if stop_event.is_set():
                break
            # Fetch all articles on the page
            articles = self.driver.find_elements(By.XPATH, '//div[@id="showList"]/div//ul/li/h5/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[contains(@class, "w") and contains(@class, "u-nav-wrap")]/span'), "政策法规"))
                except TimeoutException:
                    continue

                # Re-fetch the article list and pick the i-th element to avoid stale references
                articles = self.driver.find_elements(By.XPATH, '//div[@id="showList"]/div//ul/li/h5/a')
                article = articles[i]

                try:
                    # Click the i-th article link (a bare find_element here would always return the first link)
                    self.driver.execute_script("arguments[0].click();", article)
                except StaleElementReferenceException:
                    continue

                # Get the handles of all open windows
                window_handles = self.driver.window_handles
                # Switch to the new tab
                self.driver.switch_to.window(window_handles[-1])

                # Make sure the article detail tab has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[contains(@class, "w") and contains(@class, "u-nav-wrap")]'), "正文"))
                except TimeoutException:
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//h3[@class="u-tt"]').text
                item["date"] = self.driver.find_element(By.XPATH, '//span[@class="u-time"]').text
                # Guard against a missing source field
                source_element = self.driver.find_element(By.XPATH, '//span[@class="u-source"]')
                source_match = re.search(r'信息来源:(.+)', source_element.text)
                item["source"] = source_match.group(1) if source_match else ""
                content = self.driver.find_elements(By.XPATH, '//div[@class="u-txt"]/p')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write the record to file
                self.save_to_json(item, './results/mofcom.json')

                # Close the new tab
                self.driver.close()

                # Switch back to the original tab
                self.driver.switch_to.window(self.driver.window_handles[0])

                order += 1

            # Click "next page"
            try:
                if stop_event.is_set():
                    break
                next_page = WebDriverWait(self.driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "下一页")]')))
                self.driver.execute_script("arguments[0].click();", next_page)
            except Exception:
                break

    # Cnhnb (news.cnhnb.com) - industry news - market commentary
    def cnhnb(self, stop_event):
        self.driver.get('https://news.cnhnb.com/hqjd/?pi=1')

        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Fetch all articles on the page
            articles = self.driver.find_elements(By.XPATH, '//div[@class="latest-list"]/div/div[2]/div[1]/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="ci_crumbs"]/dl/dd[2]'), "行情解读"))
                except TimeoutException:
                    continue

                # Category
                item["classify"] = self.driver.find_element(By.XPATH, '//span[@class="ct-s"]').text

                # Click the article (re-fetched by index to avoid stale references)
                article = self.driver.find_elements(By.XPATH, '//div[@class="latest-list"]/div/div[2]/div[1]/a')[i]
                article.click()

                # Make sure the article detail page has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="ci_crumbs"]/dl/dd[2]/a'), "行情解读"))
                except TimeoutException:
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@class="title"]/h1').text
                item["date"] = re.search(r'\d{4}-\d{1,2}-\d{1,2} \d{2}:\d{2}', self.driver.find_element(By.XPATH, '//div[@class="d-tips"]').text).group()
                # Guard against a missing source field
                source_element = self.driver.find_element(By.XPATH, '//div[@class="d-tips"]')
                source_match = re.search(r'来源:([^采编]+)', source_element.text)
                item["source"] = source_match.group(1) if source_match else ""
                content = self.driver.find_elements(By.XPATH, '//div[@class="content"]/p')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write the record to file
                self.save_to_json(item, './results/cnhnb.json')

                # Go back to the list page
                self.driver.back()

                order += 1

            # Click "next page"
            try:
                if stop_event.is_set():
                    break
                next_page = self.driver.find_element(By.XPATH, '//button[@class="btn-next"]')
                WebDriverWait(self.driver, 20).until(EC.element_to_be_clickable(next_page))
                next_page.click()
            except Exception:
                break

    # # 16899.com agri-input mall (abandoned)
    # def agri_16899(self):
    #     self.driver.get('https://www.16899.com/Product/ProductList.html')
    #     self.driver.implicitly_wait(60)
    #
    #     item = {}
    #     order = 0
    #     while True:
    #         # Fetch all products on the page
    #         articles = self.driver.find_elements(By.XPATH, '//div[@id="postData"]/div[1]/dl/dd[1]/a')
    #
    #         for i, article in enumerate(articles):
    #             item["order"] = order
    #             try:
    #                 WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//li[@class="selected"]/a'), "农资商城"))
    #             except TimeoutException:
    #                 continue
    #
    #             # Re-fetch the product list to avoid stale references
    #             article = self.driver.find_elements(By.XPATH, '//div[@id="postData"]/div[1]/dl/dd[1]/a')[i]
    #             # Click the product
    #             article.click()
    #
    #             # Get the handles of all open windows
    #             window_handles = self.driver.window_handles
    #             # Switch to the new tab
    #             self.driver.switch_to.window(window_handles[-1])
    #
    #             # Make sure the product detail tab has loaded
    #             try:
    #                 WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//li[@class="active"]/a'), "产品介绍"))
    #             except TimeoutException:
    #                 continue
    #
    #             # Category
    #             item["classify"] = self.driver.find_element(By.XPATH, '//h1[@class="pro_name"]/span[1]').text
    #
    #             item["title"] = self.driver.find_element(By.XPATH, '//h1[@class="pro_name"]/span[2]').text
    #             item["price"] = self.driver.find_element(By.XPATH, '//span[@id="strongprice"]').text
    #
    #             # Write the record to file
    #             self.save_to_json(item, 'agri_16899.json')
    #
    #             # Close the new tab
    #             self.driver.close()
    #
    #             # Switch back to the original tab
    #             self.driver.switch_to.window(self.driver.window_handles[0])
    #
    #             order += 1
    #
    #         # Click "next page"
    #         try:
    #             next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下页")]')
    #             if next_page.get_attribute("class") == "disabled":
    #                 break
    #             else:
    #                 next_page.click()
    #         except Exception:
    #             break

    # 191.cn agri-input forum - featured posts - plant protection techniques
    def agri_191(self, stop_event):
        self.driver.get('https://www.191.cn/searcher.php?digest=1&starttime=&endtime=&fid=3')
        self.driver.implicitly_wait(60)

        item = {}
        order = 0

        # Get the total number of pages
        try:
            pages = self.driver.find_elements(By.XPATH, '//div[@class="pages"]/a')
            total_pages = int(pages[-1].text)
        except (ValueError, IndexError):
            total_pages = 1

        for current_page in range(1, total_pages + 1):
            if stop_event.is_set():
                break
            # Open each page by URL
            page_url = f'https://www.191.cn/searcher.php?type=special&condition=digest&authorid=&fid=3&starttime=&endtime=&page={current_page}'
            self.driver.get(page_url)

            # Fetch all posts on the page
            articles = self.driver.find_elements(By.XPATH, '//div[@class="dlA"]/dl/dt/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//li[@id="fid_3"]/a'), "植保技术"))
                except TimeoutException:
                    continue

                # Click the post (re-fetched by index to avoid stale references)
                article = self.driver.find_elements(By.XPATH, '//div[@class="dlA"]/dl/dt/a')[i]
                article.click()

                # Get the handles of all open windows
                window_handles = self.driver.window_handles
                # Switch to the new tab
                self.driver.switch_to.window(window_handles[-1])

                # Make sure the post detail tab has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//td[@id="td_tpc"]/div[1]/a'), "楼主"))
                except TimeoutException:
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//h1[@id="subject_tpc"]').text
                item["date"] = self.driver.find_element(By.XPATH, '//td[@id="td_tpc"]/div[1]/span[2]').text
                content = self.driver.find_elements(By.XPATH, '//div[@id="read_tpc"]')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write the record to file
                self.save_to_json(item, './results/agri_191.json')

                # Close the new tab
                self.driver.close()

                # Switch back to the original tab
                self.driver.switch_to.window(self.driver.window_handles[0])

                order += 1
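
To smoke-test a single spider without the GUI, a minimal sketch like the following works. It assumes webcrawl.py is importable and that a results/ directory exists next to the script; run_one.py is a hypothetical file name, and the threading.Event plays the role of the GUI's stop button (it is simply never set here):

# run_one.py - hypothetical standalone runner, not part of the original project
import threading

from webcrawl import AgriInfoSpider

if __name__ == "__main__":
    stop_event = threading.Event()  # call stop_event.set() from another thread to stop early
    spider = AgriInfoSpider()
    try:
        spider.farmer(stop_event)   # every spider method takes the same stop_event argument
    finally:
        spider.close()              # always quit the browser, even after an error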

GUI code

main.py

# Crawl all websites at once (alternative to the GUI)
# import traceback
# import concurrent.futures
# from threading import Event
# from webcrawl import AgriInfoSpider
#
#
# def run_spider(method_name):
#     spider = AgriInfoSpider()
#     try:
#         # Each spider method expects a stop event; pass one that is never set
#         getattr(spider, method_name)(Event())
#     except Exception as e:
#         print(f"{method_name} spider raised an error: {str(e)}")
#         traceback.print_exc()
#     finally:
#         spider.close()
#
#
# if __name__ == "__main__":
#     spider_methods = [
#         "agronet", "farmer", "agri", "xinnong", "richagri", "jinnong",
#         "xczxfw", "shujuaweb", "agri_12316", "wugu", "mofcom", "cnhnb",
#         "agri_191",
#     ]
#
#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         executor.map(run_spider, spider_methods)

import tkinter as tk
from tkinter import messagebox
from threading import Thread, Event
from webcrawl import AgriInfoSpider

class App:
    def __init__(self, root):
        self.running_spiders = {}  # Track spiders that are already running
        self.root = root
        self.root.title("农业信息爬虫")
        self.root.geometry("900x400")

        self.create_listbox()
        self.create_stop_button()
        self.create_run_button()

    def create_listbox(self):
        self.listbox = tk.Listbox(self.root, bd=0, highlightthickness=0, bg=self.root.cget('bg'), selectmode=tk.SINGLE, font=('楷体', 12))
        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=100, pady=70)

        self.spider_functions = {
            "农业网-农业科技": "agronet",
            "中国农网-三农头条": "farmer",
            "中国农业农村信息网-数据-市场动态": "agri",
            "新农网-农业新闻-行业资讯": "xinnong",
            "富农网-行业资讯": "richagri",
            "金农网-分析": "jinnong",
            "中国乡村振兴服务网-新闻动态-行业资讯": "xczxfw",
            "农博网-农博数据中心-实用技术": "shujuaweb",
            "三农综合信息服务平台-12316头条-动态": "agri_12316",
            "吾谷网-农技通": "wugu",
            "新农村商网-农业资讯-政策法规": "mofcom",
            "惠农网-行业资讯-行情资讯": "cnhnb",
            "191农资人-精华帖-植保技术": "agri_191"
        }

        for spider_name in self.spider_functions:
            self.listbox.insert(tk.END, spider_name)

        scrollbar = tk.Scrollbar(self.root, command=self.listbox.yview)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.listbox.config(yscrollcommand=scrollbar.set)

    def create_run_button(self):
        self.run_button = tk.Button(self.root, text="运行", command=self.run_spider, font=('黑体', 12))
        self.run_button.pack(side=tk.RIGHT, padx=(0, 20), pady=50)

    def create_stop_button(self):
        self.stop_button = tk.Button(self.root, text="停止", command=self.stop_spider, state=tk.DISABLED, font=('黑体', 12))
        self.stop_button.pack(side=tk.RIGHT, padx=(20, 150), pady=50)

    def run_spider(self):
        selected_index = self.listbox.curselection()
        if not selected_index:
            messagebox.showwarning("警告", "请先选择一个爬虫")
            return

        selected_item = self.listbox.get(selected_index)
        spider_name = self.spider_functions.get(selected_item)

        if spider_name:
            if spider_name in self.running_spiders:
                messagebox.showinfo("提示", f"{selected_item} 爬虫已经在运行")
            else:
                stop_event = Event()  # Event used to signal the spider to stop
                spider = AgriInfoSpider()
                thread = Thread(target=self.run_spider_function, args=(spider, spider_name, stop_event))
                thread.start()
                self.running_spiders[spider_name] = {"thread": thread, "stop_event": stop_event}  # register the new spider in the tracking dict
                self.stop_button.config(state=tk.NORMAL)  # enable the stop button
        else:
            messagebox.showwarning("警告", "选择的爬虫不存在")

    def run_spider_function(self, spider, spider_name, stop_event):
        try:
            getattr(spider, spider_name)(stop_event)
        except Exception as e:
            # Tkinter is not thread-safe: marshal the dialog back onto the main thread
            self.root.after(0, lambda e=e: messagebox.showerror("错误", f"爬虫运行出错: {e}"))
        finally:
            spider.close()  # quit the browser so stray ChromeDriver processes don't accumulate
            self.root.after(0, self.update_stop_button, spider_name)

    def stop_spider(self):
        selected_index = self.listbox.curselection()
        if not selected_index:
            messagebox.showwarning("警告", "请先选择一个爬虫")
            return

        selected_item = self.listbox.get(selected_index)
        spider_name = self.spider_functions.get(selected_item)

        if spider_name and spider_name in self.running_spiders:
            Thread(target=self.stop_spider_thread, args=(spider_name, selected_item)).start()
        else:
            messagebox.showwarning("警告", "选择的爬虫不存在或未运行")

    def stop_spider_thread(self, spider_name, selected_item):
        spider_info = self.running_spiders[spider_name]
        spider_info["stop_event"].set()
        spider_info["thread"].join()

        self.root.after(0, self.update_stop_button, spider_name)

        messagebox.showinfo("提示", f"{selected_item} 爬虫已停止")

    def update_stop_button(self, spider_name):
        # pop() tolerates the double call that happens when a spider is stopped manually
        self.running_spiders.pop(spider_name, None)
        if not self.running_spiders:
            self.stop_button.config(state=tk.DISABLED)


if __name__ == "__main__":
    root = tk.Tk()
    app = App(root)
    root.mainloop()
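
Since save_to_json appends one JSON object per line (JSON Lines), the result files can be read back line by line. A minimal reader sketch, assuming the farmer spider has already produced ./results/farmer.json (read_results.py is a hypothetical helper, not part of the original project):

# read_results.py - hypothetical helper for inspecting the crawled data
import json

with open('./results/farmer.json', encoding='utf-8') as f:
    items = [json.loads(line) for line in f if line.strip()]

print(len(items), "articles crawled")
print(items[0]["order"], items[0]["title"], items[0]["date"])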

The running GUI and its results are shown below.

Initial window:
[screenshot: the GUI window]
After clicking 运行 (Run):
[screenshot: the first spider running]
Running a second spider:
[screenshot: two spiders running]
After clicking 停止 (Stop):
[screenshot: the spider stopped]

