0.前言
最近写论文用到了比较出名的DBLP数据集,详细介绍见DBLP官网
DBLP是以XML的形式给出的,类似于
<?xml version="1.0" encoding="ISO-8859-1"?><!DOCTYPE dblp SYSTEM "dblp.dtd"><dblp> <article mdate="2017-05-28" key="journals/acta/Saxena96"> <author>Sanjeev Saxena</author> <title>Parallel Integer Sorting and Simulation Amongst CRCW Models.</title> <pages>607-619</pages> <year>1996</year> <volume>33</volume> <journal>Acta Inf.</journal> <number>7</number> <url>db/journals/acta/acta33.html#Saxena96</url> <ee>https://doi.org/10.1007/BF03036466</ee> </article>......<dblp>
此处将实验数据处理的过程整理如下:
1.需求
DBLP现在的数据量太大了(截止到2018-07-12 22:38,2G+),根本用不了那么多,而且在XML中不方便处理。
现在的需求是:
根据需求将一定量的数据导入到数据库(此处以mysql为例)
2.处理
经分析不难想到,问题的解决分为两步:
1.按照需求将文件截取一部分(
有人也许会说,这都是废话,谁都知道,单独列为一步, 是因为一般的编辑器,还真打不开2G+的xml
)
2.xml导入数据库
2.1截取部分文件
截取文件,我推荐的软件是UltraEdit,至于如何破解,此处不做过多赘述
开始
结束
然后截取你需要的数据拷贝到新的文件里即可,需要注意的是:新文件
的末尾别忘了拷贝结束标签
</dblp>
新文件的末尾
2.2数据导入数据库
技术选型:
java的DM4J
+mysql
(1)数据库建表
/* Navicat Premium Data Transfer Source Server Type : MySQL Source Server Version : 50527 Source Host : localhost:3306 Source Schema : mypaper Target Server Type : MySQL Target Server Version : 50527 File Encoding : 65001 */SET NAMES utf8mb4;SET FOREIGN_KEY_CHECKS = 0;-- ------------------------------ Table structure for paper-- ----------------------------DROP TABLE IF EXISTS `paper`;CREATE TABLE `paper` ( `id` int(32) UNSIGNED ZEROFILL NOT NULL AUTO_INCREMENT, `author` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `title` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `mdate` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `key` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `pages` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `year` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `volume` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `journal` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `number` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `url` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `ee` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, PRIMARY KEY (`id`) USING BTREE ) ENGINE = InnoDB AUTO_INCREMENT = 243354 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Compact;SET FOREIGN_KEY_CHECKS = 1;
(2)java处理
实体类:
package org.stone6762.domain;/** * @ClassName:MyPaper * @author Stone6762 * @Description: */public class MyPaper { private String author; private String title; private String mdate; private String key; private String pages; private String year; private String volume; private String journal; private String number; private String url; private String ee; public String getAuthor() { return author; } public void setAuthor(String author) { this.author = author; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getMdate() { return mdate; } public void setMdate(String mdate) { this.mdate = mdate; } public String getKey() { return key; } public void setKey(String key) { this.key = key; } public String getPages() { return pages; } public void setPages(String pages) { this.pages = pages; } public String getYear() { return year; } public void setYear(String year) { this.year = year; } public String getVolume() { return volume; } public void setVolume(String volume) { this.volume = volume; } public String getJournal() { return journal; } public void setJournal(String journal) { this.journal = journal; } public String getNumber() { return number; } public void setNumber(String number) { this.number = number; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public String getEe() { return ee; } public void setEe(String ee) { this.ee = ee; } @Override public String toString() { return "MyPaper [author=" + author + ", title=" + title + ", mdate=" + mdate + ", key=" + key + ", pages=" + pages + ", year=" + year + ", volume=" + volume + ", journal=" + journal + ", number=" + number + ", url=" + url + ", ee=" + ee + "]"; } }
xml解析工具
package org.stone6762.utils;import java.io.File;import java.util.ArrayList;import java.util.Iterator;import java.util.List;import org.dom4j.Attribute;import org.dom4j.Document;import org.dom4j.DocumentException;import org.dom4j.Element;import org.dom4j.io.SAXReader;import org.stone6762.domain.MyPaper;/** * @Title:XML2Bean.java * @author Stone6762 * @Description: */public class XML2Bean { public static List<MyPaper> getPapersFromXML(File file) { List<MyPaper> paperList = null; MyPaper paper = null; SAXReader reader = new SAXReader(); try { Document document = reader.read(file); Element paperstore = document.getRootElement(); Iterator storeit = paperstore.elementIterator(); paperList = new ArrayList<MyPaper>(); while (storeit.hasNext()) { paper = new MyPaper(); Element paperElement = (Element) storeit.next(); // 遍历paperElement的属性 List<Attribute> attributes = paperElement.attributes(); for (Attribute attribute : attributes) { if (attribute.getName().equals("mdate")) { String mdate = attribute.getValue(); paper.setMdate(mdate); } if (attribute.getName().equals("key")) { String key = attribute.getValue(); paper.setKey(key); } } Iterator paperit = paperElement.elementIterator(); while (paperit.hasNext()) { Element child = (Element) paperit.next(); String nodeName = child.getName(); if (nodeName.equals("author")) { String author = child.getStringValue(); paper.setAuthor(author); } else if (nodeName.equals("title")) { String title = child.getStringValue(); paper.setTitle(title); } else if (nodeName.equals("pages")) { String pages = child.getStringValue(); paper.setPages(pages); } else if (nodeName.equals("year")) { String year = child.getStringValue(); paper.setYear(year); } else if (nodeName.equals("volume")) { String volume = child.getStringValue(); paper.setVolume(volume); } else if (nodeName.equals("journal")) { String journal = child.getStringValue(); paper.setJournal(journal); } else if (nodeName.equals("number")) { String number = child.getStringValue(); paper.setNumber(number); } else if (nodeName.equals("url")) { String url = child.getStringValue(); paper.setUrl(url); } else if (nodeName.equals("ee")) { String ee = child.getStringValue(); paper.setEe(ee); } } paperList.add(paper); paper = null; } } catch (DocumentException e) { e.printStackTrace(); } return paperList; } }
导入数据库
package org.stone6762.main;import java.io.File;import java.sql.Connection;import java.sql.DriverManager;import java.sql.PreparedStatement;import java.util.List;import org.stone6762.domain.MyPaper;import org.stone6762.utils.XML2Bean;/** * @ClassName:Main * @author Stone6762 * @Description: */public class Main { private static String USERNAME = "root"; private static String PASSWORD = "root"; private static String DRIVER = "com.mysql.jdbc.Driver"; private static String URL = "jdbc:mysql://localhost:3306/mypaper"; private static Connection con = null; private static PreparedStatement pstmt = null; public static void main(String[] args) throws Exception { Class.forName(DRIVER); System.out.println("注册驱动成功!"); con = DriverManager.getConnection(URL, USERNAME, PASSWORD); System.out.println("获取连接成功!"); String sql = "INSERT INTO paper (paper.author, " + "paper.ee," + "paper.journal," + "paper.key," + "paper.mdate," + "paper.number," + "paper.pages," + "paper.title," + "paper.url," + "paper.volume," + "paper.year )VALUES (?,?,?,?,?,?,?,?,?,?,? )"; pstmt = con.prepareStatement(sql); List<MyPaper> papers = XML2Bean.getPapersFromXML(new File("ref/DBCP_1_1.xml")); for (int i = 0; i < papers.size(); i++) { MyPaper myPaper = papers.get(i); pstmt.setObject(1, myPaper.getAuthor()); pstmt.setObject(2, myPaper.getEe()); pstmt.setObject(3, myPaper.getJournal()); pstmt.setObject(4, myPaper.getKey()); pstmt.setObject(5, myPaper.getMdate()); pstmt.setObject(6, myPaper.getNumber()); pstmt.setObject(7, myPaper.getPages()); pstmt.setObject(8, myPaper.getTitle()); pstmt.setObject(9, myPaper.getUrl()); pstmt.setObject(10, myPaper.getVolume()); pstmt.setObject(11, myPaper.getYear()); pstmt.executeUpdate(); System.out.println(i+"插入成功!"); } } }
3.结果展示
部分数据
单条数据
作者:新手村的0级玩家
链接:https://www.jianshu.com/p/e3b1afcc333a
点击查看更多内容
为 TA 点赞
评论
共同学习,写下你的评论
评论加载中...
作者其他优质文章
正在加载中
感谢您的支持,我会继续努力的~
扫码打赏,你说多少就多少
赞赏金额会直接到老师账户
支付方式
打开微信扫一扫,即可进行扫码打赏哦