正文之前
事情是这样的,我前面说过了。。。。就是我的毕业论文字数写到14200的时候就感觉有点写不动了,虽然还有性能度量和致谢和一大批的文献参考没写,但是我总感觉这样不妥,所以就特地的又加了点东西。在后剪枝方法和连续值离散化之间,我选择了离散化这个相对好点的东西。后剪枝感觉没什么好补充的。。
超喜欢的长腿跳舞小姐姐
正文
从不废话,先放代码!
/* *********************
* Author : HustWolf --- 张照博
* Time : 2018.1-2018.5
* Address : HUST
* Version : 1.0
* 定义一些静态的数值,并且提供getter
********************* */import java.text.NumberFormat;import java.util.*;class Alone_Value_Category implements Comparable<Alone_Value_Category>{ private float sensor; private float category; // private float[] range = new float[2];
Alone_Value_Category(float a, float b){ super(); this.sensor = a; this.category = b;
} float getSensor(){ return sensor;
} float getCategory(){ return category;
} // void setRange(float a, float b){// range[0] = a;// range[1] =b;// }
@Override
public String toString() { return "\n[Sensor:" + sensor + ", category=" + category + "]";
} @Override
public int compareTo(Alone_Value_Category o) { return Float.compare(this.sensor,o.sensor);
}
}上面这个是定义的一个存储数据的地方,这个类用来分割数据,做到单属性对分类的格式。一条4 Sensor 1Category 一共会被拆解为4个这种类的实例分别参与EADC离散化的过程。
class Interval{ private float top; private float bottom; public Map<Float,List<Alone_Value_Category> > sample = new HashMap<Float, List<Alone_Value_Category>>();
Interval(){};
Interval(Interval b){
top = b.top;
bottom = b.bottom;
sample = b.sample;
}
Interval(float a, float b, float c, List<Alone_Value_Category> d){ this.top = a; this.bottom = b;
sample.put(c,d);
} public float getTop() { return top;
} public float getBottom() { return bottom;
} public void setTop(float top) { this.top = top;
} public void setBottom(float bottom) { this.bottom = bottom;
} public void setSample(Map<Float, List<Alone_Value_Category>> sample) { this.sample = sample;
} public Interval addTmp(Interval b){
Interval re = new Interval(b); if (top>b.top) re.setTop(top); else re.setTop(b.top); if (bottom<b.bottom) re.setBottom(bottom); else re.setBottom(b.bottom);
re.sample.putAll(sample); return re;
} public void merge(Interval b){ if (top<b.top)
top = b.top; if (bottom>b.bottom)
bottom = b.bottom;
sample.putAll(b.sample);
} public int getCount(){ int count = 0; for(List<Alone_Value_Category> s:sample.values()){
count+=s.size();
} return count;
} @Override
public String toString() { return "bottom:"+bottom+" top:"+top+" size:"+getCount();
}
}区间类,每一个区间有上界,下界,还有对应的Alone_Value_Category集合。不过这里面的集合是按照类别-->List的模式存储。按照我的数据,应该是每一个Interval都有两个List
public class Parameter { private static int rate = 2; private static int trainNum = 40000; private static int testNum = trainNum/rate; public static int getTrainNum(){ return trainNum;
} public static int getRate(){ return rate;
} public static int getTestNum(){ return testNum;
} public static int getTestDistance(){ return 2000000/testNum;
} public static int getTrainDistance(){ return 2000000/trainNum;
} public static void setRate(int r){
rate = r;
testNum = trainNum / rate;
}
public static void setTrainNum(int t){
trainNum = t;
testNum = trainNum / rate;
} public static void setTestNum(int t){
testNum = t;
trainNum = testNum * rate;
} public static void Clear(ArrayList<Interval> allInterval){
ArrayList<Interval> del = new ArrayList<>(); for (int s = 0;s<allInterval.size();++s) { if (allInterval.get(s).getCount() == 0){ if (s>0) {
allInterval.get(s - 1).merge(allInterval.get(s));
del.add(allInterval.get(s));
} continue;
}
}
allInterval.removeAll(del);
} static double Entropy(ArrayList<Interval> set, int size){ double shang = 0;
NumberFormat nf = NumberFormat.getNumberInstance();
nf.setMaximumFractionDigits(4); for (Interval x:set){ double p =(double)x.getCount()/(double)size;
shang -= p*(Math.log(p)/Math.log(2));
} return Double.parseDouble(nf.format(shang));
} public static ArrayList<List<Float>> EADC(float[][] dat) {
ArrayList<List<Float>> re = new ArrayList<>(); for (int valueindex = 0; valueindex< dat[0].length-1;++valueindex) {
ArrayList<Alone_Value_Category> LIST = new ArrayList<>(); for (int i = 0; i < dat.length; ++i) {
LIST.add(new Alone_Value_Category(dat[i][valueindex], dat[i][dat[valueindex].length - 1])); //便利旧集合没有就添加到新集合
}
Collections.sort(LIST); float len = LIST.get(LIST.size() - 1).getSensor() - LIST.get(0).getSensor(); int k = 40; float gap = (len + 1) / k; float Lowest = LIST.get(0).getSensor() - 0.50f; float Highest = LIST.get(LIST.size()-1).getSensor() + 0.50f;
NumberFormat nf = NumberFormat.getNumberInstance();
nf.setMaximumFractionDigits(1);
List<Float> range = new LinkedList<>(); for (int x = 0; x <= k; ++x) {
range.add(Float.parseFloat(nf.format(Lowest + x * gap)));
}
ArrayList<Interval> allInterval = new ArrayList<>(); for (int i = 0; i < k; ++i) {
Interval newarea = new Interval();
newarea.setBottom(range.get(i));
newarea.setTop(range.get(i + 1)); for (Alone_Value_Category s : LIST) { if (s.getSensor() > range.get(i) && s.getSensor() < range.get(i + 1)) { if (!newarea.sample.containsKey(s.getCategory())) {
newarea.sample.put(s.getCategory(), new LinkedList<>());
}
newarea.sample.get(s.getCategory()).add(s);
}
}
allInterval.add(newarea);
} int size = 0;
Clear(allInterval); for (Interval s : allInterval) {
size += s.getCount();
}
k = allInterval.size(); int k0 = k; double Ck0 = 0.5; boolean Loop = true; double Hpk_1 = 0; while (Loop && k >= 10) { double minD = 1000; int mergePoint = 0; double Hp0 = Entropy(allInterval, size); double Hpk;
ArrayList<Interval> newA = new ArrayList<>(); for (int i = 0; i < allInterval.size() - 1; ++i) {
newA.addAll(allInterval);
newA.get(i).merge(newA.get(i + 1));
newA.remove(i + 1);
Hpk = Entropy(newA, size); if (Hpk - Hp0 < minD) {
Hpk_1 = Hpk;
minD = Hpk - Hp0;
mergePoint = i;
}
newA.clear();
}
allInterval.get(mergePoint).merge(allInterval.get(mergePoint + 1));
allInterval.remove(allInterval.get(mergePoint + 1)); double Ck_1 = (k0 - 1) * Hpk_1 - Hp0 * (k - 2); if (Ck_1 > Ck0) {
--k;
} else {
Loop = false;
--k;
}// Ck = Ck_1;
}
range.clear();
range.add(-100f); for (Interval s:allInterval) {
range.add(s.getTop());
}
range.add(100f);
re.add(range);// long endTime=System.currentTimeMillis(); //获取结束时间// System.out.println("\n程序运行时间: "+(endTime-startTime)+"ms");
} return re;
}
}主体类,也是EADC算法的(一种基于熵的连续属性离散化算法)的Java实现!我是三天晒网,一天打渔,不过终于今天还是肝出来了。。这就意味着差不多要收工了!美滋滋Q!!!
具体来说其实还好吧。。。等后面毕业了我把我的毕业论文写成简书发出来,大家伙就看的明白了咯!现在先上数学表达!
最后得到的伪代码就是下面的了:
当然,他这个有点看不明白,看我的解释吧!
整个离散化的过程如下:
(1) 从数据库读取数据,传入到离散化方法中;
(2) 先针对单一的属性,取出所有的值,并且对其进行排序;
(3) 排序后划分区间,并且利用熵的计算公式计算出初始熵,设置度量数值Ck = 0 ;
(4) 合并两个相邻区间,使合并前后的熵差最小,并且重置划分点,保存合并后的熵值;
(5) 根据上面的度量公式计算出Ck-1 = h;
(6) 如果Ck-1 > Ck ,那么k = k -1,回到第(4)步;
(7) 如果Ck-1 < Ck ,保存当前的区间划分,结束区间划分进程;
(8) 将传入的数据根据当前区间划分进行离散化。
离散化流程图如下:
上面这图花了好久。才算是理清了。。。不容易啊不容易!!
正文之后
争取今晚写完论文,明天排版完毕,最好事明天先自查,然后大后天上知网查重。。。大大后天,要给某人一个惊喜,就是不知道她能不能看到了!!
作者:HustWolf
链接:https://www.jianshu.com/p/4d7c11fe3644
共同学习,写下你的评论
评论加载中...
作者其他优质文章






