HR给了一堆简历doc文件,要能用solr查询,所以我先分析存入数据库。
碰到的问题
-
.doc文件怎么读不了?哦这是后缀直接改成.doc的,实际是html文件
-
过滤html文件(包括html、script、style标签,大量的连续空格和回车)
-
怎么有乱码?设置一下UTF-8
-
怎么还有乱码?使用unescape将Html字符实体编码转换为汉字
-
提取简历中基本信息(注意数组越界以及信息的长度与数据库设置长度等问题)
-
JAVA中创建实体类
-
用mybatis写入数据库
分析过滤HTML文件测试代码
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;
public class Resume {
private static final String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>"; // 定义script的正则表达式
private static final String regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>"; // 定义style的正则表达式
private static final String regEx_html = "<[^>]+>"; // 定义HTML标签的正则表达式
private static final String regEx_space = "\\s*|\t|\r|\n";// 定义空格回车换行符
private static final String regEx_w = "<w[^>]*?>[\\s\\S]*?<\\/w[^>]*?>";//定义所有w标签
public static String delHTMLTag(String htmlStr) {
Pattern p_w = Pattern.compile(regEx_w, Pattern.CASE_INSENSITIVE);
Matcher m_w = p_w.matcher(htmlStr);
htmlStr = m_w.replaceAll(""); // 过滤script标签
Pattern p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
Matcher m_script = p_script.matcher(htmlStr);
htmlStr = m_script.replaceAll(""); // 过滤script标签
Pattern p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
Matcher m_style = p_style.matcher(htmlStr);
htmlStr = m_style.replaceAll(""); // 过滤style标签
Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
Matcher m_html = p_html.matcher(htmlStr);
htmlStr = m_html.replaceAll(""); // 过滤html标签
htmlStr = htmlStr.replaceAll(" ", " "); // 过滤html中 字符
// Pattern p_space = Pattern.compile(regEx_space, Pattern.CASE_INSENSITIVE);
// Matcher m_space = p_space.matcher(htmlStr);
// htmlStr = m_space.replaceAll(""); // 过滤空格回车标签
htmlStr = htmlStr.replaceAll(" ", ""); //过滤
return htmlStr.trim(); // 返回文本字符串
}
public static void main(String[] args) throws Exception {
// InputStream in = new FileInputStream("path");
// OutputStream out = new FileOutputStream("path");
File input = new File("path");
File output = new File("path");
String fileInput = "path";
String fileOutput = "path";
String[] arr = input.list();
for (String fileName : arr) {
System.out.println(fileName);
if(fileName.endsWith(".doc")) {
fileInput = "path";
fileOutput = "path\\result.txt";
fileInput += fileName;
// fileOutput += fileName;
System.out.println(fileInput);
System.out.println(fileOutput);
InputStreamReader file = new InputStreamReader(new FileInputStream(fileInput),"UTF-8");
StringBuilder result = new StringBuilder();
StringBuilder basic = new StringBuilder();
try{
BufferedReader br = new BufferedReader(file);//构造一个BufferedReader类来读取文件
String s = null;
while((s = br.readLine())!=null){//使用readLine方法,一次读一行
result.append(System.lineSeparator()+s);
}
br.close();
}catch(Exception e){
e.printStackTrace();
}
String str = result.toString();
// HTML中许多换行通过<br/>实现,如果直接过滤掉会丢失换行
str = str.replaceAll("<br/>", "\n");
// 将字符实体编码转换为汉字
str = StringEscapeUtils.unescapeHtml(str);
// 去掉HTML标签
str = delHTMLTag(str);
// 去掉重复的空格
str = filterSpace(str);
// 去掉重复的回车
str = filterReturn(str);
// 提取姓名
String name = collectBasicInfo(str, "姓名:");
basic.append(name);
basic.append(",");
// 提取姓名
String gender = collectBasicInfo(str, "性别:");
basic.append(gender);
basic.append(",");
// 提取年龄
String age = collectBasicInfo(str, "年龄:");
basic.append(age);
basic.append(",");
// 提取工作年限
String workSeniority = collectBasicInfo(str, "工作年限:");
basic.append(workSeniority);
basic.append(",");
// 提取最高学历
String eduBackground = collectBasicInfo(str, "最高学历:");
basic.append(eduBackground);
basic.append(",");
// 提取手机号码
String cellPhone = collectBasicInfo(str, "手机号码:");
basic.append(cellPhone);
basic.append(",");
// 提取电子邮箱
String email = collectBasicInfo(str, "电子邮箱:");
basic.append(email);
// basic.append(" ");
str = basic.toString();
System.out.print(str);
OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(fileOutput, true),"UTF-8");
out.write(str.toCharArray());
out.write("\r\n");
out.flush();
out.close();
}
}
}
public static String filterSpace (String str) {
StringBuilder sb = new StringBuilder();
boolean isFirstSpace = false;//标记是否是第一个空格
char c;
for(int i = 0; i < str.length(); i++){
c = str.charAt(i);
if(c == ' ' || c == '\t'){//遇到空格字符时,先判断是不是第一个空格字符
if(!isFirstSpace) {
sb.append(c);
isFirstSpace = true;
}
} else {
sb.append(c);
isFirstSpace = false;
}
}
return sb.toString();
}
public static String filterReturn (String str) {
String result = "";
if (str != null) {
Pattern p = Pattern.compile("(\r?\n(\\s*\r?\n)+)");
Matcher m = p.matcher(str);
result = m.replaceAll("\r\n");
}
result = result.replaceAll("\r\n", "\n");
result = result.replaceAll("\n\r", "\n");
return result;
}
public static String collectBasicInfo (String str, String token) {
StringBuilder sb = new StringBuilder();
int indexOfName = str.indexOf(token);
if (indexOfName <= 0) {
return "";
}
for (int i = indexOfName; i < str.length(); i++) {
char c = str.charAt(i);
if (c == '\r' || c == '\n') {
for (int j = i+1; j < str.length(); j++) {
if (str.charAt(j) == '\r' || str.charAt(j) == '\n') {
if (str.substring(i+1,j).contains(":")) {
sb.append(" ");
} else {
sb.append(str.substring(i+1,j));
}
break;
}
}
break;
}
}
return sb.toString();
}
}