JAVA分析过滤HTML文件

09 Apr 2019

Reading time ~3 minutes
HR给了一堆.doc格式的简历文件，要求能用Solr检索，所以我先解析这些文件，把提取的信息存入数据库。
碰到的问题

.doc文件怎么读不了？原来这些文件只是把后缀直接改成了.doc，实际内容是HTML文件
过滤html文件（包括html、script、style标签，大量的连续空格和回车）
怎么有乱码？设置一下UTF-8
怎么还有乱码？使用unescape将Html字符实体编码转换为汉字
提取简历中基本信息（注意数组越界以及信息的长度与数据库设置长度等问题）
JAVA中创建实体类
用mybatis写入数据库
分析过滤HTML文件测试代码

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringEscapeUtils;

public class Resume {
	private static final String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>"; // 定义script的正则表达式
    private static final String regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>"; // 定义style的正则表达式
    private static final String regEx_html = "<[^>]+>"; // 定义HTML标签的正则表达式
    private static final String regEx_space = "\\s*|\t|\r|\n";// 定义空格回车换行符
    private static final String regEx_w = "<w[^>]*?>[\\s\\S]*?<\\/w[^>]*?>";//定义所有w标签
    
    public static String delHTMLTag(String htmlStr) {
	Pattern p_w = Pattern.compile(regEx_w, Pattern.CASE_INSENSITIVE);
	Matcher m_w = p_w.matcher(htmlStr);
	htmlStr = m_w.replaceAll(""); // 过滤script标签
 
 
	Pattern p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
	Matcher m_script = p_script.matcher(htmlStr);
	htmlStr = m_script.replaceAll(""); // 过滤script标签
 
 
	Pattern p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
	Matcher m_style = p_style.matcher(htmlStr);
	htmlStr = m_style.replaceAll(""); // 过滤style标签
 
 
	Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
	Matcher m_html = p_html.matcher(htmlStr);
	htmlStr = m_html.replaceAll(""); // 过滤html标签
 
	htmlStr = htmlStr.replaceAll("&nbsp;", " "); // 过滤html中&nbsp;字符
//        Pattern p_space = Pattern.compile(regEx_space, Pattern.CASE_INSENSITIVE);
//        Matcher m_space = p_space.matcher(htmlStr);
//        htmlStr = m_space.replaceAll(""); // 过滤空格回车标签
 
 
	htmlStr = htmlStr.replaceAll(" ", ""); //过滤 
	return htmlStr.trim(); // 返回文本字符串
    }

    
    public static void main(String[] args) throws Exception {  
//    	InputStream in = new FileInputStream("path");
//    	OutputStream out = new FileOutputStream("path");
	
	
	File input = new File("path");
	File output = new File("path");
	String fileInput = "path";
	String fileOutput = "path";
	String[] arr = input.list();
	for (String fileName : arr) {
		System.out.println(fileName);
	    if(fileName.endsWith(".doc")) {
		fileInput = "path";
		fileOutput = "path\\result.txt";
		fileInput += fileName;
//            	fileOutput += fileName;
		System.out.println(fileInput);
		System.out.println(fileOutput);
		InputStreamReader file = new InputStreamReader(new FileInputStream(fileInput),"UTF-8");
		StringBuilder result = new StringBuilder();
		StringBuilder basic = new StringBuilder();
		
		try{
		    BufferedReader br = new BufferedReader(file);//构造一个BufferedReader类来读取文件
		    String s = null;
		    while((s = br.readLine())!=null){//使用readLine方法，一次读一行
			result.append(System.lineSeparator()+s);
		    }
		    br.close();    
		}catch(Exception e){
		    e.printStackTrace();
		}
		String str = result.toString();
		// HTML中许多换行通过<br/>实现，如果直接过滤掉会丢失换行
		str = str.replaceAll("<br/>", "\n");
		// 将字符实体编码转换为汉字
		str = StringEscapeUtils.unescapeHtml(str);
		// 去掉HTML标签
		str = delHTMLTag(str);
		// 去掉重复的空格
		str = filterSpace(str);
		// 去掉重复的回车
		str = filterReturn(str);
		
		// 提取姓名
		String name = collectBasicInfo(str, "姓名：");
		basic.append(name);
		basic.append("，");
		
		// 提取姓名
		String gender = collectBasicInfo(str, "性别：");
		basic.append(gender);
		basic.append("，");
		
		// 提取年龄
		String age = collectBasicInfo(str, "年龄：");
		basic.append(age);
		basic.append("，");
		
		// 提取工作年限
		String workSeniority = collectBasicInfo(str, "工作年限：");
		basic.append(workSeniority);
		basic.append("，");
		
		// 提取最高学历
		String eduBackground = collectBasicInfo(str, "最高学历：");
		basic.append(eduBackground);
		basic.append("，");
		
		// 提取手机号码
		String cellPhone = collectBasicInfo(str, "手机号码：");
		basic.append(cellPhone);
		basic.append("，");
		
		// 提取电子邮箱
		String email = collectBasicInfo(str, "电子邮箱：");
		basic.append(email);
//            	basic.append(" ");
		str = basic.toString();
		System.out.print(str);
		OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(fileOutput, true),"UTF-8");
		out.write(str.toCharArray());
		out.write("\r\n");
		out.flush();
		out.close();
	    }
	}
	
	

    }  
    
    public static String filterSpace (String str) {
	StringBuilder sb = new StringBuilder();
	boolean isFirstSpace = false;//标记是否是第一个空格
	char c;
	for(int i = 0; i < str.length(); i++){
		c = str.charAt(i);
		if(c == ' ' || c == '\t'){//遇到空格字符时,先判断是不是第一个空格字符
			if(!isFirstSpace) {
				sb.append(c);
				isFirstSpace = true;
			}
		} else {
			sb.append(c);
			isFirstSpace = false;
		}
	}
	return sb.toString();
    }
    
    public static String filterReturn (String str) {
	String result = "";
		if (str != null) {
			Pattern p = Pattern.compile("(\r?\n(\\s*\r?\n)+)");
			Matcher m = p.matcher(str);
			result = m.replaceAll("\r\n");
		}
		result = result.replaceAll("\r\n", "\n");
		result = result.replaceAll("\n\r", "\n");
		return result;
    }
    
    public static String collectBasicInfo (String str, String token) {
	StringBuilder sb = new StringBuilder();
	int indexOfName = str.indexOf(token);
	if (indexOfName <= 0) {
		return "";
	}
	for (int i = indexOfName; i < str.length(); i++) {
		char c = str.charAt(i);
		if (c == '\r' || c == '\n') {
			for (int j = i+1; j < str.length(); j++) {
				if (str.charAt(j) == '\r' || str.charAt(j) == '\n') {
					if (str.substring(i+1,j).contains("：")) {
						sb.append(" ");
					} else { 
						sb.append(str.substring(i+1,j));
					}
					break;
				}
			}
			break;
		}
	}
	return sb.toString();
    }

}
JAVA