版權說明:本文檔由用戶提供并上傳,收益歸屬內容提供方,若內容存在侵權,請進行舉報或認領
文檔簡介
1、實驗 數據挖掘決策樹算法實現 實驗內容 決策樹算法是非常常用的分類算法,是逼近離散目標函數的方法,學習得到的函數以決策樹的形式表示。其基本思路是不斷選取產生信息增益最大的屬性來劃分樣例集合,構造決策樹。信息增益定義為結點與其子結點的信息熵之差。信息熵是香農提出的,用于描述信息不純度(不穩定性),其計算公式是 $Entropy(S) = -\sum_{i=1}^{c} p_i \log_2 p_i$,其中 $c$ 為類別數,$p_i$ 為子集合中不同性(而二元分類即正樣例和負樣例)的樣例的比例。這樣信息收益可以定義為樣本按照某屬性劃分時造成熵減少的期望,可以區分訓練樣本中正負樣本的能力,其計算公式是:$Gain(S, A) = Entropy(S) - \sum_{v \in V(A)} \frac{|S_v|}{|S|} Entropy(S_v)$
2、其中 $V(A)$ 是屬性 $A$ 的值域,$S$ 是樣本集合,$S_v$ 是 $S$ 中在屬性 $A$ 上值等于 $v$ 的樣本集合。實現該算法針對的樣例集合如下:
outlook temperature humidity wind playtennis
1 sunny hot high weak no
2 sunny hot high strong no
3 overcast hot high weak yes
4 rainy mild high weak yes
5 rainy cool normal weak yes
6 rainy cool normal strong no
7 overcast cool normal strong yes
8
3、sunny mild high weak no
9 sunny cool normal weak yes
10 rainy mild normal weak yes
11 sunny mild normal weak yes
12 overcast mild high strong yes
13 overcast hot normal weak yes
14 rainy mild high strong no
該表記錄了在不同氣候條件下是否去打球的情況,要求根據該表用程序輸出決策樹。二 算法實現 C++代碼如下:#include <iostrcam> #include <string>#include <vector>
4、;#include <map>#include <algorithm>#include <cmath> using namespace std;#define maxlen 6輸入每行的數(shù)據(jù)個(gè)數(shù)vector <vector <string> > state; vector <string> item(maxlen); vector <string> attribute_row; string end(”end”);/輸入結(jié)束 string yes(nyesn);string no(nnon);string bl
5、ank(“”);map<string,vector < string > > map. int tree_size = 0;struct nodestring attribute;string arrived_value; vector<node *> childs;node()attribute = blank; arrived_value = blank;node * root;實(shí)例集對(duì)應(yīng)一行實(shí)例集保存首行即屬性行數(shù)據(jù)attributevalues;/存儲(chǔ)屬性對(duì)應(yīng)的所冇的值決策樹(shù)節(jié)點(diǎn)屬性值到達(dá)的屬性值所有的孩了根據(jù)數(shù)據(jù)實(shí)例計(jì)算屬性與值組成的mapvoid
6、 computemapfrom2dvector() unsigned int i,jjk;bool exited = false;vector<string> values;for(i = 1;i< maxleng-1; i+)按照列遍歷for (j = 1; j < statc.sizc(); j+)for (k = 0; k < values.size(); k+) if(!pare(statejlli) exited = true;if(!exited)values.push_back(stateji);exited = false;m
7、ap_attribute_valuesstateoi = values; values.erase(values.begin(), values.end();根據(jù)具體屈性和值來(lái)計(jì)算爛double computeentropy(vector <vector <string> > remain_state, string attribute, string value,bool ifparent)vcctor<int> count (2,0);unsigned int ij;bool done_flag = false;for(j = l;j< maxle
8、n; j+)if(done_flag) break;if(! attributc_row j .comparc(attributc) for(i = 1; i < remain_state.size(); i+)if(!ifparent&&!remain_pare(value) ii ifparent)/ifparent 記錄是否算父節(jié)點(diǎn)if(!remain_stateilmaxlen - pare(yes)count0+;else countll j+;donc_flag = true;)if(count0 = 0 ii count 1
9、= 0 ) return 0;double sum = count()l + countfl;doubleentropy=-count0/sum*log(count0/sum)/log(2.0)-countl /sum*log(count 1 /sum)/log(2.0);return entropy;計(jì)算按照屬性attribute劃分當(dāng)前剩余實(shí)例的信息增益double computegain(vector <vector <string> > remain_state, string attribute)!unsigned int j,k,m;double paren
10、t_entropy = computeentropy(remain_state, attribute, blank, true);double children_entropy = 0;vcctor<string> values = map_attributc_valucsattributc;vector<double> ratio;vector<int> count_values;int tcmpint;for(m = 0; m < values.size(); m+)tempint = 0;for(k = l;k< maxlen 1; k+)
11、if(! attribute_row kj .compare(attribute) for(j = 1; j < remain_state.size(); j+)if(!remain_pare(valuesm) tempint+;) count_values.push_back(tempint);)for(j = 0; j < values.size(); j+)ratio.push_back(doublc)count_valucsj / (doublc)(rcmain_statc.sizc()-1); double temp_entropy;for(j =
12、0; j < values.size(); j+)temp_entropy = computeentropy(remain_state, attribute, valuesjl, false); childrcn_cntropy += ratioj * tcmp_cntropy;return (parent_entropy - children_entropy);int findattrinumbynamc(string attri)for(int i = 0; i < maxlen; i+)if(!pare(attri) return i;cerr«
13、;mcan,t find the numth of attributeu«endl;return 0;找出樣例屮占多數(shù)的正/負(fù)性string mostcommonlabel(vector <vector <string> > remain_state) int p = 0, n = 0;for(unsigned i = 0; i < remain_state.size(); i+)if(!remain_stateijmaxlen-pare(yes) p+; else n+;if(p >= n) return yes;else return no;判
14、斷樣例是否正負(fù)性都為labelbool allthesamelabel(vector <vector <string> > remain_state, string label) int count = 0;for(unsigned int i = 0; i < remain_state.size(); i+) if(!remain_stateilmaxlen-pare(label) count+;if(count = remain_state.size()-1) return true;else return false;node * buliddcc
15、isiontrccdfs(nodc * p, vector <vcctor <string> > rcmain_statc, vector <string> remain_attribute) /if(remain_state.size() > 0)/pri n t v( re mai n_state);/if (p = null)p = new node();if (allthesamelabel(remain_state, yes)p->attribute = yes;return p;if (ahthesamelabel(remain_st
16、ate5 no) p->attribute = no;return p;if(rcmain_attributc.sizc() = 0)string label = mostcommonlabcl(rcmain_statc);p->attribute = label;return p;double max_gain = 0, tcmp_gain;vector <string>:iterator max_it = remain_attribute.begin();vector <string>:iterator itl;for(itl = remain_attr
17、ibute.begin(); itl < remain_attribute.end(); itl+) temp_gain = computegain(remain_state, (*itl);if(tcmp_gain > max_gain) max_gain = temp_gain;max_it = itl;下面根據(jù)max"指向的屬性來(lái)劃分當(dāng)前樣例,更浙樣例集和屬性集 vector <string> new_attribute;vector <vector <string> > new_state;for(vector <stri
18、ng>:itcrator it2 = rcmain_attributc.bcgin(); it2 < rcmain_d(); it2+)if(*it2).compare(*max_it) new_attribute.push_back(*it2);)p >auribute = *max_it;vector <string> values = map_attribute_values*max_il;int attribuc_num = findattrinumbynamc(*maxt); new_state.push_back(attribute_row);for(
19、vector <string>:iterator it3 = values.begin(); it3 < values.end(); it3+) for(unsigned int i = 1; i < remain_state.size(); i+) if(!remain_stateiattribue_pare(*it3) ncw_statc.push_back(rcmain_statci);node * new_node = new node();new node->arrived value = *it3;if(ncw_statc.sizc()
20、= 0)ncw_nodc->attributc=mostcommonlabel(remain_suite);elsebuliddecisiontreedfs(new_node, new_state, new_attribute); p->childs.push_back(ncw_nodc);new_state.erase(new_stcite.begin()+1 ,new_state.end();/return p;void input() string s; while(cin»s,pare(end) != 0)/-l 為輸入結(jié)束 item0 = s;for(int i
21、 = 1 ;i < maxlen; i+) cin»itcmi;state. pu sh_back(item); for(int j = 0;j< maxlen; j+)attribute_row.push_back(state0j);void printtree(node *p, int depth)for (int i = 0; i < depth; i+) cout« v;按照樹(shù)的深度先輸出 tabif(!p->arrivcd_valuc.cmpty()cout«p->arrived_value«endl;for (i
22、nt i = 0; i < depth+l; i+) cout« 't' 按照樹(shù)的深度先輸出 tabcout«p->attribute«endl;for (vector<node*>:iterator it = p->childs.begin(); it != p->childs.end(); it+) printtree(*it, depth + 1);void freetree(node *p)訐(p = null)return;for (vector<nodeitendor it = p->ch
23、ilds.begin(); it != p->childs.end(); it+) frcctrcc(*it);delete p;tree_size+;int main()input();vector <string> remain_attribute;string outlook(houtlooklf);string temperature(htemperatureh);string humidity("humiditym);string wind("wind,r);remain_attribute.push_back(outlook); remain_attribute.push_back(temperature);remain_attribute.push_back(humidity); remain_attribute.push_back(wind
溫馨提示
- 1. 本站所有資源如無特殊說明,都需要本地電腦安裝OFFICE2007和PDF閱讀器。圖紙軟件為CAD,CAXA,PROE,UG,SolidWorks等.壓縮文件請下載最新的WinRAR軟件解壓。
- 2. 本站的文檔不包含任何第三方提供的附件圖紙等,如果需要附件,請聯系上傳者。文件的所有權益歸上傳用戶所有。
- 3. 本站RAR壓縮包中若帶圖紙,網頁內容里面會有圖紙預覽,若沒有圖紙預覽就沒有圖紙。
- 4. 未經權益所有人同意不得將文件中的內容挪作商業或盈利用途。
- 5. 人人文庫網僅提供信息存儲空間,僅對用戶上傳內容的表現方式做保護處理,對用戶上傳分享的文檔內容本身不做任何修改或編輯,并不能對任何下載內容負責。
- 6. 下載文件中如有侵權或不適當內容,請與我們聯系,我們立即糾正。
- 7. 本站不保證下載資源的準確性、安全性和完整性, 同時也不承擔用戶因使用這些下載資源對自己和他人造成任何形式的傷害或損失。
最新文檔
- 土地使用權出租協議
- 工業統計工作總結
- 卷揚工安全環保職業衛生職責(4篇)
- 運輸區副隊長安全生產責任制(2篇)
- 2025年骨干教師個人年度工作計劃
- 淘寶美工工作總結
- 變電站知識培訓課件
- 農村環保知識培訓課件
- 2025年浙江省安全員A證考試題庫附答案
- 貴州城市職業學院《社會調查與研究方法》2023-2024學年第一學期期末試卷
- 網絡賭博、網絡借貸和網絡詐騙的危害
- 《中西醫的區別》課件
- RFID電子標簽制作方法
- 智能制造企業數字化轉型建設方案
- 病理生理學課件脂代謝紊亂
- 教師幽默朗誦節目《我愛上班》
- 《細胞工程學》考試復習題庫(帶答案)
- 中學課堂教學評價量表
- 食堂食材配送以及售后服務方案
- 塊單項活動教學材料教案丹霞地貌
- 青年人應該如何樹立正確的人生觀
評論
0/150
提交評論