java实现图片爬取

开发环境

IntelliJ IDEA 2021.1.3

jdk1.8

Jsoup介绍

一款Java 的HTML解析器

jsoup 是一款Java 的HTML解析器，可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API，可通过DOM，CSS以及类似于jQuery的操作方法来取出和操作数据。

爬取图片

分析地址：https://pvp.qq.com/web201605/herolist.shtml

打开王者荣耀官网

按F12使用开发者工具检查元素

首先找到class="herolist clearfix"的ul标签

再找到对应英雄的li标签

其中a标签的文本内容就是该英雄的名字，a标签的href就是该英雄详细信息的相对位置

我们可以将a标签的相对位置获取下来在和"https://pvp.qq.com/web201605/"拼接一下获得该英雄详细信息的页面地址

在新获取的地址中继续按下F12进入开发者模式检查元素

其中class=zk-con1 zk-con的div标签中的style样式中的background属性中的地址即为背景图片地址。

以桑启英雄为例，该英雄图片地址为https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/545/545-bigskin-1.jpg

在网页中尝试访问一下

效果如下：

代码实现

1.pom

  <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version>
        </dependency>
 <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.11.0</version>
        </dependency>

2.使用jsoup获取网页代码

@Test
    void contextLoads() {
        //用Java写一段爬虫程序
        /**
         * 思路：
         *  1、建立一个连接（主网址）
         *      https://pvp.qq.com/web201605/herolist.shtml
         *  2、找到一个ul标签对象（所有的li）
         *  3、for循环每一个li
         *     读取里面的a --- href
         *     根据href创建新连接
         *     背景图
         *  4、通过 I/O 读取 （下载到本地）
         */
        try {
            //1、建立一个连接（主网址）
            // https://pvp.qq.com/web201605/herolist.shtml
            String url = "https://pvp.qq.com/web201605/herolist.shtml";
            Connection connect = Jsoup.connect(url);
            //2、通过连接获取一个 Document 对象
            Document document = connect.get();
            //3、通过 Document 对象,寻找里面的一个UL对象， 的class属性的 herolist
            Element elementUL = document.selectFirst("[class = herolist clearfix]");
            //4、通过 Element 标签对象,寻找里面的一个LI对象
            Elements elementLis = elementUL.select("li");
            //5、遍历上面这个 elementLis 集合
            for (Element elementLi : elementLis) {  // <li></li>
                //想要得到li中的哪个 <a href = "路径">鬼谷子</a>连接元素  名字  路径
                Element elementA = elementLi.selectFirst("a");
                //得到<a href = "路径">xxx</a>中间的哪个文字-------英雄名字
                String HeroName = elementA.text();//英雄名称
                //得到 href 中的属性值
                //https://pvp.qq.com/web201605/herolist.shtml
                //herodetail/545.shtml
                String heroURL = elementA.attr("href");  //主要目的的路径
                //发现得到的 heroURL  是一个相对的路径，拼接绝对的路径
                //https://pvp.qq.com/web201605/herodetail/545.shtml
                String path = " https://pvp.qq.com/web201605/" + heroURL;  //拼接英雄详细信息网页地址
 
                //根据这个path路径，去创建一个新的连接 （相当于，点击了）
                Connection newConnection = Jsoup.connect(path);
                //根据新的连接，获取新的Document 对象
                Document newDocument = newConnection.get();
                //通过新的Document获取一个div
                // <div class="zk-con1 zk-con">
                Element elementDiv = newDocument.selectFirst("[class=zk-con1 zk-con]");
                //获取这个div中的 style 属性值
                // style = "background:url('//game.gtimg.cn/images/yxzj/img201606/skin/hero-info/545/545-bigskin-1.jpg') center 0"
                String backgroundURL = elementDiv.attr("style");//获取英雄图片地址
                int length = backgroundURL.length();
                System.out.println("长度："+length);
                //找寻 两个单引号的位置
                int left = backgroundURL.indexOf("'");
                System.out.println(left);
                int right = backgroundURL.lastIndexOf("'");
                System.out.println(right);
                String newBG = backgroundURL.substring(left + 1, right);
                URL newUrl = new URL("https:" + newBG);
 
                System.out.println("地址："+newUrl);
 
                //以上的部分就是将网站上的一些标签做了一个分析
                System.out.println("下载:"+HeroName);
 
 
                // 输出流，读取刚才的  newUrl 路径对应的图片
                InputStream is = newUrl.openStream();
                // 输出流，写到我们的本地的硬盘上
                String newPath = "C://desk/爬取图片/"+HeroName+".jpg";
 
                FileOutputStream fos = new FileOutputStream(newPath);//写入path文件夹
 
                //下载图片
                IOUtils.copy(is, fos);
                fos.close();
                is.close();
            }
 
        } catch (IOException e) {
            e.printStackTrace();
        }
 
    }

效果