java解析xml
Java解析xml
遇到一个问题,要解析一个xml,网上搜了搜,都说有4种方式,试了试dom解析,感觉解析的时候开发效率太低,忽然想到Jsoup,然后就用了第5种方式Jsoup解析XML。
用Jsoup解析XML,开发效率确实是高,但是运行效率太低了。解析一个10K左右的xml要0.2s左右。300万的xml文件要解析到什么时候呀。
然后试了试Dom解析xml,解析一个10K左右的xml只要0.05s左右,运行效率提高了不少。
当然,还有其他3种方式解析。知道Sax解析时占用内存小,可能会快一点,但是着急处理文件,暂时没有测试。
DOM SAX JDOM DOM4J Jsoup
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.ParserConfigurationException;
import com.xxxx.usdp.odk.common.file.FileUtil;
import com.xxxx.usdp.xxxx.poc.yuyin.entity.XmlEntity;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.*;
import org.xml.sax.SAXException;
/**
 * Parses XML files with the JAXP DOM API and merges the word/time-slice lists of
 * two {@code channel} elements into one dialogue text ordered by start time.
 *
 * <p>The parser reads the first {@code instance} element (attributes
 * {@code waveuri} and {@code duration}) and the first two {@code channel}
 * elements below the second {@code subject} element. Each channel is expected
 * to carry a {@code text} child (whitespace-separated terms) and a {@code time}
 * child (whitespace-separated {@code start,end} pairs, one per term).
 *
 * @version V1.0
 */
public class DomParserXml {
    private static final Logger log = LoggerFactory.getLogger(DomParserXml.class);

    /** Maximum number of directory entries processed by one {@link #batchParser()} run. */
    private static final int BATCH_LIMIT = 1000;

    /** Speaker label attached to terms taken from the first channel. */
    private static final String SPEAKER_N0 = "n0";

    /** Speaker label attached to terms taken from the second channel. */
    private static final String SPEAKER_N1 = "n1";

    /**
     * Manual smoke test: parses one hard-coded file and logs the parsed entity.
     */
    @Test
    public void testPaser() {
        // Other fixtures that were used with this test:
        //   src/main/resources/2018010109013384362390728_1522286730680.xml
        //   src/main/resources/test.xml
        String xmlFilePath = "C:\\home\\user1\\xxxx\\2018010109000672062385406";
        XmlEntity xmlEntity = parserXml(xmlFilePath);
        log.info("xml数据:\r\n{}", xmlEntity);
    }

    /**
     * Batch run: parses up to {@link #BATCH_LIMIT} entries of a hard-coded input
     * directory and writes each merged dialogue to {@code <fileName>.txt} in a
     * hard-coded output directory. Entries that cannot be parsed are skipped.
     */
    @Test
    public void batchParser() {
        String filePath = "D:\\data\\210_1\\210_test";
        String outDirPath = "D:\\data\\210_1\\210_201801_result";
        File outDir = new File(outDirPath);
        if (!outDir.exists() && !outDir.mkdirs()) {
            log.error("创建输出目录失败 {}", outDirPath);
            return;
        }
        // listFiles() returns null when the path is not a readable directory.
        File[] files = new File(filePath).listFiles();
        if (files == null) {
            log.error("无法读取目录 {}", filePath);
            return;
        }
        // Never index past the real number of entries.
        int limit = Math.min(files.length, BATCH_LIMIT);
        for (int i = 0; i < limit; i++) {
            XmlEntity xmlEntity = parserXml(files[i].getAbsolutePath());
            if (xmlEntity == null) {
                // parserXml has already logged the reason.
                continue;
            }
            try {
                FileUtil.writeStringToFile(xmlEntity.getMix(), outDirPath + "/" + xmlEntity.getFileName() + ".txt");
            } catch (IOException e) {
                log.error("写文件出错 {}", e.toString(), e);
            }
        }
    }

    /**
     * Parses the XML file at the given path into dialogue form.
     *
     * @param xmlFilePath path of the XML file, relative or absolute
     * @return the parsed entity, or {@code null} when the path is {@code null},
     *         the file cannot be parsed, or its structure is not as expected
     */
    public static XmlEntity parserXml(String xmlFilePath) {
        if (xmlFilePath == null) {
            log.error("xml文件路径为空");
            return null;
        }
        return parserXml(new File(xmlFilePath));
    }

    /**
     * Parses the given XML file into dialogue form.
     *
     * <p>Terms of both channels are tagged with {@code n0} / {@code n1}, sorted
     * by start time (stable, so equal start times keep channel order) and
     * concatenated; every change of speaker starts a new {@code "\r\n<who> : "}
     * segment.
     *
     * @param f the XML file
     * @return the parsed entity, or {@code null} when {@code f} is {@code null},
     *         the file cannot be parsed, or its structure is not as expected
     */
    public static XmlEntity parserXml(File f) {
        if (f == null) {
            log.error("xml文件为空");
            return null;
        }
        long t1 = System.currentTimeMillis();
        Document document = loadDocument(f);
        if (document == null) {
            return null;
        }
        XmlEntity xmlEntity = new XmlEntity();
        xmlEntity.setFileName(f.getName().replace(".xml", ""));
        long t2 = System.currentTimeMillis();
        log.info("读文件用时{}s", 1.0 * (t2 - t1) / 1000);

        Element instance = (Element) document.getElementsByTagName("instance").item(0);
        if (instance == null) {
            log.error("文件格式错误,缺少instance节点");
            return null;
        }
        String waveuri = instance.getAttribute("waveuri");
        log.debug("waveuri:{}", waveuri);
        xmlEntity.setWaveuri(waveuri);
        String duration = instance.getAttribute("duration");
        log.debug("duration:{}", duration);
        xmlEntity.setDuration(duration);

        NodeList subjectNodes = document.getElementsByTagName("subject");
        if (subjectNodes == null || subjectNodes.getLength() < 2) {
            log.error("文件格式错误,subject节点个数小于2个");
            return null;
        }
        log.debug("subject节点个数:{}", subjectNodes.getLength());

        // Text and time slices are read from the second subject element.
        Element subject1 = (Element) subjectNodes.item(1);
        NodeList channels = subject1.getElementsByTagName("channel");
        log.debug("channels 节点个数:{}", channels.getLength());
        if (channels.getLength() < 2) {
            log.error("文件格式错误,channel节点个数小于2个");
            return null;
        }

        List<TimeTextEntity> timeTextList = new ArrayList<>();
        String textA = collectChannel((Element) channels.item(0), SPEAKER_N0, timeTextList);
        String textB = collectChannel((Element) channels.item(1), SPEAKER_N1, timeTextList);
        if (textA == null || textB == null) {
            log.error("文件格式错误,channel节点缺少text或time子节点");
            return null;
        }
        xmlEntity.setN0(textA);
        xmlEntity.setN1(textB);

        long t4 = System.currentTimeMillis();
        // Ascending by start time.
        Collections.sort(timeTextList, new Comparator<TimeTextEntity>() {
            @Override
            public int compare(TimeTextEntity o1, TimeTextEntity o2) {
                return Integer.compare(o1.getStart(), o2.getStart());
            }
        });
        long t5 = System.currentTimeMillis();
        log.info("排序用时{}s", 1.0 * (t5 - t4) / 1000);

        String mix = buildDialogue(timeTextList);
        xmlEntity.setMix(mix);
        long t3 = System.currentTimeMillis();
        log.info("解析用时{}s", 1.0 * (t3 - t2) / 1000);
        log.info("总共用时{}s", 1.0 * (t3 - t1) / 1000);
        log.debug("对话:{}", mix);
        return xmlEntity;
    }

    /**
     * Loads the file into a DOM document; logs and returns {@code null} on failure.
     */
    private static Document loadDocument(File f) {
        try {
            // NOTE(review): the factory runs with default settings. If input files can come
            // from an untrusted source, DTDs / external entities should be disabled here.
            DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            return db.parse(f);
        } catch (ParserConfigurationException | SAXException | IOException e) {
            // Passing the exception as the trailing argument keeps the stack trace in the log.
            log.error("Dom解析Xml出错 {}", e.toString(), e);
            return null;
        }
    }

    /**
     * Reads the first {@code text} and {@code time} children of a channel and appends
     * one entry per term to {@code sink}.
     *
     * @return the trimmed channel text, or {@code null} when either child is missing
     */
    private static String collectChannel(Element channel, String who, List<TimeTextEntity> sink) {
        log.debug("tagname:{}", channel.getTagName());
        Node textNode = channel.getElementsByTagName("text").item(0);
        Node timeNode = channel.getElementsByTagName("time").item(0);
        if (textNode == null || timeNode == null) {
            return null;
        }
        String text = textNode.getTextContent().trim();
        String time = timeNode.getTextContent().trim();
        log.debug("{} text:|{}|", who, text);
        log.debug("{} time:|{}|", who, time);
        appendTerms(text, time, who, sink);
        return text;
    }

    /**
     * Pairs each term of {@code text} with its {@code start,end} slice from {@code time}
     * and appends the pair to {@code sink}; malformed slices are logged and skipped.
     */
    private static void appendTerms(String text, String time, String who, List<TimeTextEntity> sink) {
        if (text.isEmpty() || time.isEmpty()) {
            // Nothing was said on this channel.
            return;
        }
        // Split on whitespace runs so that repeated blanks or line breaks do not
        // produce empty tokens that shift terms against their time slices.
        String[] terms = text.split("\\s+");
        String[] slices = time.split("\\s+");
        log.debug("{} textLength:{}", who, terms.length);
        if (terms.length != slices.length) {
            log.warn("{} 词语个数({})与时间片个数({})不一致", who, terms.length, slices.length);
        }
        int count = Math.min(terms.length, slices.length);
        for (int i = 0; i < count; i++) {
            String[] bounds = slices[i].split(",");
            if (bounds.length < 2) {
                log.warn("{} 时间片格式错误:{}", who, slices[i]);
                continue;
            }
            try {
                int start = Integer.parseInt(bounds[0].trim());
                int end = Integer.parseInt(bounds[1].trim());
                sink.add(new TimeTextEntity(start, end, terms[i], who));
            } catch (NumberFormatException e) {
                log.warn("{} 时间片格式错误:{}", who, slices[i]);
            }
        }
    }

    /**
     * Concatenates the ordered terms, opening a new {@code "\r\n<who> : "} segment
     * whenever the speaker differs from the previous entry.
     */
    private static String buildDialogue(List<TimeTextEntity> ordered) {
        StringBuilder sb = new StringBuilder();
        String current = null;
        for (int i = 0; i < ordered.size(); i++) {
            TimeTextEntity entity = ordered.get(i);
            log.debug("{} {}", i, entity);
            String who = entity.getWho();
            if (!who.equals(current)) {
                sb.append("\r\n").append(who).append(" : ");
                current = who;
            }
            sb.append(entity.getText()).append(" ");
        }
        return sb.toString();
    }
}
/**
 * Mutable holder for a time span described by an integer start and end value.
 */
class TimeEntity {

    private int start;
    private int end;

    /** Creates a span whose start and end are both zero. */
    public TimeEntity() {
        this(0, 0);
    }

    /**
     * Creates a span with the given bounds.
     *
     * @param start start value of the span
     * @param end   end value of the span
     */
    public TimeEntity(int start, int end) {
        this.start = start;
        this.end = end;
    }

    /** @return the start value */
    public int getStart() {
        return this.start;
    }

    /** @return the end value */
    public int getEnd() {
        return this.end;
    }

    /** @param start the new start value */
    public void setStart(int start) {
        this.start = start;
    }

    /** @param end the new end value */
    public void setEnd(int end) {
        this.end = end;
    }

    /**
     * Renders the span as {@code TimeEntity{start='<start>', end='<end>'}}.
     */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder("TimeEntity{");
        out.append("start='").append(start).append('\'');
        out.append(", end='").append(end).append('\'');
        out.append('}');
        return out.toString();
    }
}
/**
 * Mutable holder for one piece of text together with its integer start/end
 * values and the label of the speaker it belongs to.
 */
class TimeTextEntity {

    private int start;
    private int end;
    private String text;
    /** Speaker label: n0 or n1. */
    private String who;

    /** Creates an entity with zero bounds and {@code null} text and speaker. */
    public TimeTextEntity() {
        this(0, 0, null, null);
    }

    /**
     * Creates a fully populated entity.
     *
     * @param start start value
     * @param end   end value
     * @param text  the text
     * @param who   speaker label
     */
    public TimeTextEntity(int start, int end, String text, String who) {
        this.start = start;
        this.end = end;
        this.text = text;
        this.who = who;
    }

    /** @return the start value */
    public int getStart() {
        return this.start;
    }

    /** @return the end value */
    public int getEnd() {
        return this.end;
    }

    /** @return the text */
    public String getText() {
        return this.text;
    }

    /** @return the speaker label */
    public String getWho() {
        return this.who;
    }

    /** @param start the new start value */
    public void setStart(int start) {
        this.start = start;
    }

    /** @param end the new end value */
    public void setEnd(int end) {
        this.end = end;
    }

    /** @param text the new text */
    public void setText(String text) {
        this.text = text;
    }

    /** @param who the new speaker label */
    public void setWho(String who) {
        this.who = who;
    }

    /**
     * Renders the entity as
     * {@code TimeTextEntity{start=<start>, end=<end>, text='<text>', who='<who>'}}.
     */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder("TimeTextEntity{");
        out.append("start=").append(start);
        out.append(", end=").append(end);
        out.append(", text='").append(text).append('\'');
        out.append(", who='").append(who).append('\'');
        out.append('}');
        return out.toString();
    }
}