代码
import org.apache.commons.lang.StringUtils;
import org.springframework.web.util.HtmlUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * HTML utility class.
 *
 * <p>Offers regex-based helpers for pulling plain text and image sources out of
 * HTML fragments. These are lightweight heuristics, not a full HTML parser.
 */
public class HTMLUtils {

    /** Matches any opening or closing tag, e.g. {@code <p>} or {@code </div>}. */
    private static final Pattern ANY_TAG = Pattern.compile("</?[^>]+>");

    /** Matches a run of one or more ASCII space characters. */
    private static final Pattern SPACE_RUN = Pattern.compile("( )+");

    /**
     * Matches one {@code <img ...>} tag, case-insensitively. Group 1 is the raw
     * attribute text between the tag name and the closing {@code >}.
     *
     * <p>Quoted attribute values are consumed as a unit, so a {@code >} inside a
     * quoted value does not end the tag, and two tags on the same line are never
     * merged into one match. The inner quantifier is possessive to keep
     * backtracking linear on input that never closes the tag.
     */
    private static final Pattern IMG_TAG = Pattern.compile(
            "<img\\b((?:[^>\"']++|\"[^\"]*\"|'[^']*')*)>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches one {@code name=value} attribute. Group 1 is the attribute name;
     * the value is in group 2 (double-quoted), group 3 (single-quoted) or
     * group 4 (unquoted), with the surrounding quotes excluded.
     */
    private static final Pattern ATTRIBUTE = Pattern.compile(
            "([^\\s=/>\"']++)\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s>]+))");

    /**
     * Returns the text content of an HTML fragment with all tags removed.
     *
     * <p>Tags are stripped first, runs of ASCII spaces are collapsed to a single
     * space, HTML entities are then unescaped, and the result is trimmed. Because
     * unescaping happens after tag removal, escaped markup such as
     * {@code &lt;b&gt;} is kept as literal text.
     *
     * @param html the HTML source; may be {@code null}
     * @return the plain text, or an empty string when {@code html} is
     *         {@code null}, empty or whitespace-only
     */
    public static String getInnerText(String html) {
        if (!StringUtils.isNotBlank(html)) {
            return "";
        }
        // Remove the HTML tags.
        String content = ANY_TAG.matcher(html).replaceAll("");
        // Collapse consecutive spaces into a single space.
        content = SPACE_RUN.matcher(content).replaceAll(" ");
        // Convert entity references back into the characters they stand for.
        content = HtmlUtils.htmlUnescape(content);
        return content.trim();
    }

    /**
     * Extracts the {@code src} attribute value of every {@code <img>} tag in the
     * given rich text, in document order.
     *
     * <p>Values may be double-quoted, single-quoted or unquoted; surrounding
     * quotes are never part of the returned value. Tag and attribute names are
     * matched case-insensitively. Only an attribute named exactly {@code src}
     * counts (not e.g. {@code data-src}), and when a tag repeats the attribute
     * the first occurrence is used. Values are returned as written; entity
     * references inside them are not unescaped.
     *
     * @param richText the HTML source; may be {@code null}
     * @return a new mutable list of {@code src} values; empty when
     *         {@code richText} is {@code null} or contains no image with a
     *         {@code src} attribute
     */
    public static List<String> getImgStr(String richText) {
        List<String> sources = new ArrayList<>();
        if (richText == null) {
            return sources;
        }
        Matcher tagMatcher = IMG_TAG.matcher(richText);
        while (tagMatcher.find()) {
            String src = findSrc(tagMatcher.group(1));
            if (src != null) {
                sources.add(src);
            }
        }
        return sources;
    }

    /**
     * Finds the value of the first {@code src} attribute in the attribute text
     * of a single tag.
     *
     * @param attributes the raw text between the tag name and the closing {@code >}
     * @return the attribute value without quotes, or {@code null} if there is no
     *         {@code src} attribute
     */
    private static String findSrc(String attributes) {
        Matcher attribute = ATTRIBUTE.matcher(attributes);
        // Attributes are tokenized left to right so that text such as "src=" inside
        // another attribute's quoted value is consumed as a value, not read as a name.
        while (attribute.find()) {
            if (!"src".equalsIgnoreCase(attribute.group(1))) {
                continue;
            }
            if (attribute.group(2) != null) {
                return attribute.group(2);
            }
            if (attribute.group(3) != null) {
                return attribute.group(3);
            }
            return attribute.group(4);
        }
        return null;
    }

    /**
     * Small demo: prints the inner text and the image sources of a sample fragment.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String str = HTMLUtils.getInnerText("<p>aaaaaaa</p><div>22222222222222</div><img src='http://192.168.1.120/'></img>");
        List<String> images = HTMLUtils.getImgStr("<p>aaaaaaa</p><div>22222222222222</div><img src='http://192.168.1.120/aa.png'></img>");
        System.out.println(str);
        System.out.println(images);
    }
}
执行效果: