数据挖掘决策树算法实现

上传人：伐*** IP属地：宁夏上传时间：2021-12-04 格式：DOCX 页数：9 大小：47.37KB 积分：10.8 举报 版权申诉

已阅读5页，还剩4页未读，继续免费阅读

版权说明：本文档由用户提供并上传，收益归属内容提供方，若内容存在侵权，请进行举报或认领

文档简介

1、实验数据挖掘决策树算法实现实验内容决策树算法是非常常用的分类算法，是逼近离散目标函数的方法，学习得到的函数以决策树的形式表示。其基木思路是不断选取产生信息增益最人的属性来划分样例集和，构造决策树。信息增益定义为结点与其子结点的信息爛之差。信息癇是香农提出的，用于描述信息不纯度 (不稳定性)，其计算公式是entropy (s)二 -z a log p,z = 1pi子集合中不同性(而二元分类即正样例和负样例)的样例的比例。这样信息收益nj以定义为样本按照某丿肉性划分吋造成嫡减少的期望，可以区分训练样本中正负样本的能力，其计算公式是：gain (s9a) = entropy (s) 工 -

2、'entropy (s、.)ver(j) i s i 7(a)是属件a的值域s足样本集介s、.是s屮在屈件a i值等jf的样木集介实现该算法针对的样例集仑如下outlook temperaturehumidinwindplavtennis1sunnyhothighweakno<-'sutmvhothighstrongno3overcasthothighweakves4rainyfmildhighweakv亡s3if5rainvcoolnomialweakves6rainvjcoolnomialstrongno a7overcastcoolnomialstrongy亡sa8

3、sunnymildhighweak门oq9sunnycoolnormalweakv亡s310rainvniildnomialweakyes111suimyjmildnomialweakvesa12overcastmildhighstrongy亡s313overcasthotnormalweakves<-*j14rainymildhighstrongnoq该农记录了在不同气候条件下是否公打球的悄况，要求根据该表用程序输出决策树二算法实现c+代码如下：#include <iostrcam> #include <string>#include <vector>

4、;#include <map>#include <algorithm>#include <cmath> using namespace std;#define maxlen 6输入每行的数据个数vector <vector <string> > state; vector <string> item(maxlen); vector <string> attribute_row; string end(”end”);/输入结束 string yes(nyesn);string no(nnon);string bl

5、ank(“”)；map<string,vector < string > > map. int tree_size = 0;struct nodestring attribute;string arrived_value; vector<node *> childs;node()attribute = blank; arrived_value = blank;node * root;实例集对应一行实例集保存首行即属性行数据attributevalues;/存储属性对应的所冇的值决策树节点属性值到达的属性值所有的孩了根据数据实例计算属性与值组成的mapvoid

6、 computemapfrom2dvector() unsigned int i,jjk;bool exited = false;vector<string> values;for(i = 1;i< maxleng-1; i+)按照列遍历for (j = 1; j < statc.sizc(); j+)for (k = 0; k < values.size(); k+) if(!pare(statejlli) exited = true;if(!exited)values.push_back(stateji);exited = false;m

7、ap_attribute_valuesstateoi = values; values.erase(values.begin(), values.end();根据具体屈性和值來计算爛double computeentropy(vector <vector <string> > remain_state, string attribute, string value,bool ifparent)vcctor<int> count (2,0);unsigned int ij;bool done_flag = false;for(j = l;j< maxle

8、n; j+)if(done_flag) break;if(! attributc_row j .comparc(attributc) for(i = 1; i < remain_state.size(); i+)if(!ifparent&&!remain_pare(value) ii ifparent)/ifparent 记录是否算父节点if(!remain_stateilmaxlen - pare(yes)count0+;else countll j+；donc_flag = true;)if(count0 = 0 ii count 1

9、= 0 ) return 0;double sum = count()l + countfl;doubleentropy=-count0/sum*log(count0/sum)/log(2.0)-countl /sum*log(count 1 /sum)/log(2.0);return entropy;计算按照属性attribute划分当前剩余实例的信息增益double computegain(vector <vector <string> > remain_state, string attribute)!unsigned int j,k,m;double paren

10、t_entropy = computeentropy(remain_state, attribute, blank, true);double children_entropy = 0;vcctor<string> values = map_attributc_valucsattributc;vector<double> ratio;vector<int> count_values;int tcmpint;for(m = 0; m < values.size(); m+)tempint = 0;for(k = l;k< maxlen 1; k+)

11、if(! attribute_row kj .compare(attribute) for(j = 1; j < remain_state.size(); j+)if(!remain_pare(valuesm) tempint+;) count_values.push_back(tempint);)for(j = 0; j < values.size(); j+)ratio.push_back(doublc)count_valucsj / (doublc)(rcmain_statc.sizc()-1); double temp_entropy;for(j =

12、0; j < values.size(); j+)temp_entropy = computeentropy(remain_state, attribute, valuesjl, false); childrcn_cntropy += ratioj * tcmp_cntropy;return (parent_entropy - children_entropy);int findattrinumbynamc(string attri)for(int i = 0; i < maxlen; i+)if(!pare(attri) return i;cerr«

13、;mcan,t find the numth of attributeu«endl;return 0;找出样例屮占多数的正/负性string mostcommonlabel(vector <vector <string> > remain_state) int p = 0, n = 0;for(unsigned i = 0; i < remain_state.size(); i+)if(!remain_stateijmaxlen-pare(yes) p+; else n+;if(p >= n) return yes;else return no;判

14、断样例是否正负性都为labelbool allthesamelabel(vector <vector <string> > remain_state, string label) int count = 0;for(unsigned int i = 0; i < remain_state.size(); i+) if(!remain_stateilmaxlen-pare(label) count+;if(count = remain_state.size()-1) return true;else return false;node * buliddcc

15、isiontrccdfs(nodc * p, vector <vcctor <string> > rcmain_statc, vector <string> remain_attribute) /if(remain_state.size() > 0)/pri n t v( re mai n_state);/if (p = null)p = new node();if (allthesamelabel(remain_state, yes)p->attribute = yes;return p;if (ahthesamelabel(remain_st

16、ate5 no) p->attribute = no;return p;if(rcmain_attributc.sizc() = 0)string label = mostcommonlabcl(rcmain_statc);p->attribute = label;return p;double max_gain = 0, tcmp_gain;vector <string>:iterator max_it = remain_attribute.begin();vector <string>:iterator itl;for(itl = remain_attr

17、ibute.begin(); itl < remain_attribute.end(); itl+) temp_gain = computegain(remain_state, (*itl);if(tcmp_gain > max_gain) max_gain = temp_gain;max_it = itl;下面根据max"指向的属性来划分当前样例，更浙样例集和属性集 vector <string> new_attribute;vector <vector <string> > new_state;for(vector <stri

18、ng>:itcrator it2 = rcmain_attributc.bcgin(); it2 < rcmain_d(); it2+)if(*it2).compare(*max_it) new_attribute.push_back(*it2);)p >auribute = *max_it;vector <string> values = map_attribute_values*max_il;int attribuc_num = findattrinumbynamc(*maxt); new_state.push_back(attribute_row);for(

19、vector <string>:iterator it3 = values.begin(); it3 < values.end(); it3+) for(unsigned int i = 1; i < remain_state.size(); i+) if(!remain_stateiattribue_pare(*it3) ncw_statc.push_back(rcmain_statci);node * new_node = new node();new node->arrived value = *it3;if(ncw_statc.sizc()

20、= 0)ncw_nodc->attributc=mostcommonlabel(remain_suite);elsebuliddecisiontreedfs(new_node, new_state, new_attribute); p->childs.push_back(ncw_nodc);new_state.erase(new_stcite.begin()+1 ,new_state.end();/return p;void input() string s; while(cin»s,pare(end) != 0)/-l 为输入结束 item0 = s;for(int i

21、 = 1 ;i < maxlen; i+) cin»itcmi;state. pu sh_back(item); for(int j = 0;j< maxlen; j+)attribute_row.push_back(state0j);void printtree(node *p, int depth)for (int i = 0; i < depth; i+) cout« v;按照树的深度先输出 tabif(!p->arrivcd_valuc.cmpty()cout«p->arrived_value«endl;for (i

22、nt i = 0; i < depth+l; i+) cout« 't' 按照树的深度先输出 tabcout«p->attribute«endl;for (vector<node*>:iterator it = p->childs.begin(); it != p->childs.end(); it+) printtree(*it, depth + 1);void freetree(node *p)讦(p = null)return;for (vector<nodeitendor it = p->ch

23、ilds.begin(); it != p->childs.end(); it+) frcctrcc(*it);delete p;tree_size+;int main()input();vector <string> remain_attribute;string outlook(houtlooklf);string temperature(htemperatureh);string humidity("humiditym);string wind("wind,r);remain_attribute.push_back(outlook); remain_attribute.push_back(temperature);remain_attribute.push_back(humidity); remain_attribute.push_back(wind

人人文库> 全部分类> 生活休闲 > 科普知识

温馨提示

1. 本站所有资源如无特殊说明，都需要本地电脑安装OFFICE2007和PDF阅读器。图纸软件为CAD,CAXA,PROE,UG,SolidWorks等.压缩文件请下载最新的WinRAR软件解压。
2. 本站的文档不包含任何第三方提供的附件图纸等，如果需要附件，请联系上传者。文件的所有权益归上传用户所有。
3. 本站RAR压缩包中若带图纸，网页内容里面会有图纸预览，若没有图纸预览就没有图纸。
4. 未经权益所有人同意不得将文件中的内容挪作商业或盈利用途。
5. 人人文库网仅提供信息存储空间，仅对用户上传内容的表现方式做保护处理，对用户上传分享的文档内容本身不做任何修改或编辑，并不能对任何下载内容负责。
6. 下载文件中如有侵权或不适当内容，请与我们联系，我们立即纠正。
7. 本站不保证下载资源的准确性、安全性和完整性, 同时也不承担用户因使用这些下载资源对自己和他人造成任何形式的伤害或损失。

数据挖掘决策树算法实现

文档简介

温馨提示

最新文档

评论

数据挖掘决策树算法实现

文档简介

温馨提示

最新文档

评论

相关文档