// Deletes a new[]-allocated array and resets the pointer to NULL.
// Wrapped in do/while(0) so the macro expands to exactly one statement
// and is safe inside unbraced if/else chains (the original if-form had
// a dangling-else hazard). delete[] on a null pointer is a no-op, so
// the explicit null check is unnecessary.
#define DELETE_ARRAY(p) do { delete[] (p); (p) = NULL; } while(0)
// Column layout of the selection result table (m_pSelection):
// rank (1-based order of selection), zero-based feature index,
// feature name, and the mRMR score of the feature.
88 enum ESG_mRMR_Selection
90 SG_mRMR_SELECTION_RANK = 0,
91 SG_mRMR_SELECTION_INDEX,
92 SG_mRMR_SELECTION_NAME,
93 SG_mRMR_SELECTION_SCORE
// Constructor: data has not been discretized yet.
// NOTE(review): remaining member initialization (m_pSelection, m_Samples,
// counters) lies in lines not visible here — confirm in the full source.
102 CSG_mRMR::CSG_mRMR(
void)
107 m_bDiscretized =
false;
// Destructor: releases the selection result table.
// NOTE(review): presumably also calls Destroy() to free the sample
// matrix — the intervening lines are not visible here; verify.
119 CSG_mRMR::~CSG_mRMR(
void)
123 delete(m_pSelection);
// Resets the object: frees the sample matrix, clears the discretization
// flag and empties the selection table.
127 void CSG_mRMR::Destroy(
void)
// m_Samples[0] owns the single contiguous nSamples*nVars data block;
// m_Samples itself is only the array of row pointers (see Get_Memory).
// NOTE(review): evaluating m_Samples[0] dereferences m_Samples — a
// guard against m_Samples == NULL is presumably in the hidden lines.
131 DELETE_ARRAY(m_Samples[0]);
132 DELETE_ARRAY(m_Samples);
138 m_bDiscretized =
false;
140 m_pSelection ->Del_Records();
// Tool description text: credits Hanchuan Peng's mRMR algorithm and
// cites the two primary publications (TPAMI 2005, JBCB 2005) plus the
// project homepage. These are runtime string literals — leave verbatim.
152 "The minimum Redundancy Maximum Relevance (mRMR) feature selection "
153 "algorithm has been developed by Hanchuan Peng <hanchuan.peng@gmail.com>.\n"
156 "Feature selection based on mutual information: criteria of max-dependency, max-relevance, and min-redundancy. "
157 "Hanchuan Peng, Fuhui Long, and Chris Ding, "
158 "IEEE Transactions on Pattern Analysis and Machine Intelligence, "
159 "Vol. 27, No. 8, pp.1226-1238, 2005.\n"
161 "Minimum redundancy feature selection from microarray gene expression data,\n"
162 "Chris Ding, and Hanchuan Peng, "
163 "Journal of Bioinformatics and Computational Biology, "
164 "Vol. 3, No. 2, pp.185-205, 2005.\n"
166 "Hanchuan Peng's mRMR Homepage at "
167 "<a target=\"_blank\" href=\"http://penglab.janelia.org/proj/mRMR/\">"
168 "http://penglab.janelia.org/proj/mRMR/</a>\n"
// Registration of the tool's four mRMR parameters (the surrounding
// Parameters_Add_* calls are in lines not visible here):
// number of features to select.
183 ParentID,
"mRMR_NFEATURES" ,
_TL(
"Number of Features"),
// Whether to discretize the input data first (off = data is already
// integer-valued).
// NOTE(review): "discretizaton" is a typo inside a runtime string
// literal; cannot be changed in a comments-only edit.
189 ParentID,
"mRMR_DISCRETIZE" ,
_TL(
"Discretization"),
190 _TL(
"uncheck this means no discretizaton (i.e. data is already integer)"),
// Discretization threshold (0 => binarization); see Discretize().
195 ParentID,
"mRMR_THRESHOLD" ,
_TL(
"Discretization Threshold"),
196 _TL(
"a double number of the discretization threshold; set to 0 to make binarization"),
// Selection criterion: MID (difference) or MIQ (quotient).
201 ParentID,
"mRMR_METHOD" ,
_TL(
"Selection Method"),
204 _TL(
"Mutual Information Difference (MID)"),
205 _TL(
"Mutual Information Quotient (MIQ)")
// Parameter-driven wrappers (function headers are in lines not visible
// here). Each reads its parameter if present, otherwise falls back to a
// default: discretize=true, threshold=1.0, nFeatures=50, method=0 (MID).
// A negative threshold passed to Set_Data means "do not discretize".
226 bool bDiscretize = (*pParameters)(
"mRMR_DISCRETIZE") ? (*pParameters)(
"mRMR_DISCRETIZE")->asBool () :
true;
227 double Threshold = (*pParameters)(
"mRMR_THRESHOLD" ) ? (*pParameters)(
"mRMR_THRESHOLD" )->asDouble() : 1.0;
229 return( Set_Data(Data, ClassField, bDiscretize ? Threshold : -1.0) );
// Same logic for the CSG_Matrix overload.
234 bool bDiscretize = (*pParameters)(
"mRMR_DISCRETIZE") ? (*pParameters)(
"mRMR_DISCRETIZE")->asBool () :
true;
235 double Threshold = (*pParameters)(
"mRMR_THRESHOLD" ) ? (*pParameters)(
"mRMR_THRESHOLD" )->asDouble() : 1.0;
237 return( Set_Data(Data, ClassField, bDiscretize ? Threshold : -1.0) );
// Selection driven by parameters: feature count and MID/MIQ method.
243 int nFeatures = (*pParameters)(
"mRMR_NFEATURES") ? (*pParameters)(
"mRMR_NFEATURES")->asInt() : 50;
244 int Method = (*pParameters)(
"mRMR_METHOD" ) ? (*pParameters)(
"mRMR_METHOD" )->asInt() : 0;
246 return( Get_Selection(nFeatures, Method) );
255 int CSG_mRMR::Get_Count (
void)
const
257 return( (
int)m_pSelection->Get_Count() );
260 int CSG_mRMR::Get_Index(
int i)
const
262 return( m_pSelection->Get_Record(i)->asInt (SG_mRMR_SELECTION_INDEX) );
// Body of Get_Name(i): name of the i-th selected feature (the function
// header lies in lines not visible here).
267 return( m_pSelection->Get_Record(i)->asString(SG_mRMR_SELECTION_NAME ) );
270 double CSG_mRMR::Get_Score(
int i)
const
272 return( m_pSelection->Get_Record(i)->asDouble(SG_mRMR_SELECTION_SCORE) );
// Verbose-mode progress output helpers: emit Message (resp. a warning)
// to the execution log only when m_bVerbose is set. Each macro is
// wrapped in do/while(0) so it expands to exactly one statement and is
// safe as the unbraced body of an if/else (the original bare-if form
// had a dangling-else hazard).
#define ADD_MESSAGE(Message) do { if( m_bVerbose ) { SG_UI_Msg_Add_Execution(CSG_String(Message) + "\n", false); } } while(0)
#define ADD_WARNING(Message) do { if( m_bVerbose ) { SG_UI_Msg_Add_Execution(_TL("Warning") + CSG_String(": ") + CSG_String(Message) + "\n", false, SG_UI_MSG_STYLE_FAILURE); } } while(0)
// Allocates the sample matrix for nVars variables x nSamples samples:
// one array of row pointers plus a single contiguous data block of
// nSamples*nVars doubles hung off m_Samples[0] (rows are wired to
// offsets into that block in Set_Data).
290 bool CSG_mRMR::Get_Memory(
int nVars,
int nSamples)
304 m_nSamples = nSamples;
// Reject empty input; the error path is in lines not visible here.
306 if( m_nSamples <= 0 )
314 m_Samples =
new double *[m_nSamples];
323 m_Samples[0] =
new double[m_nSamples * m_nVars];
// Loads samples from a CSG_Table: column 0 of each internal row holds
// the (relabelled) class value, followed by all non-class attributes.
// Threshold >= 0 triggers discretization; a negative Threshold means
// the data is taken as already discrete.
336 bool CSG_mRMR::Set_Data(
CSG_Table &Data,
int ClassField,
double Threshold)
344 if( ClassField < 0 || ClassField >= m_nVars )
353 for(
int iSample=0, Class=0; iSample<m_nSamples; iSample++)
// Wire this row into the contiguous data block.
355 double *pData = m_Samples[iSample] = m_Samples[0] + iSample * m_nVars;
// Each new class string encountered gets the next integer label
// (s caches the previous class string; Class is presumably
// incremented and stored in the hidden lines).
357 if( s.
Cmp(Data[iSample].asString(ClassField)) )
359 s = Data[iSample].asString(ClassField);
// Copy all attribute values except the class field itself.
366 for(
int iVar=0; iVar<m_nVars; iVar++)
368 if( iVar != ClassField )
370 *pData++ = Data[iSample].asDouble(iVar);
// Collect variable names (class first, then attributes) — the
// m_VarNames additions are in lines not visible here.
379 for(
int iVar=0; iVar<m_nVars; iVar++)
381 if( iVar != ClassField )
388 if( Threshold >= 0.0 )
390 Discretize(Threshold);
// Loads samples from a CSG_Matrix: same internal layout as the table
// overload — class value first, then all non-class variables. The
// class column is used numerically as-is (no string relabelling).
397 bool CSG_mRMR::Set_Data(
CSG_Matrix &Data,
int ClassField,
double Threshold)
405 if( ClassField < 0 || ClassField >= m_nVars )
410 for(
int iSample=0; iSample<m_nSamples; iSample++)
// Wire this row into the contiguous data block.
412 double *pData = m_Samples[iSample] = m_Samples[0] + iSample * m_nVars;
414 *pData++ = Data[iSample][ClassField];
416 for(
int iVar=0; iVar<m_nVars; iVar++)
418 if( iVar != ClassField )
420 *pData++ = Data[iSample][iVar];
// Matrix columns are unnamed: use "CLASS" plus generated names
// (generation for the remaining columns is in hidden lines).
425 m_VarNames +=
"CLASS";
427 for(
int iVar=0; iVar<m_nVars; iVar++)
429 if( iVar != ClassField )
436 if( Threshold >= 0.0 )
438 Discretize(Threshold);
// Discretizes every variable except the class column (j starts at 1):
// first z-score standardization (mean 0, sample std 1), then mapping
// into discrete states by comparing against +/-Threshold (the assigned
// state values are in lines not visible here — presumably {-1, 0, 1},
// which a Threshold of 0 reduces to binarization).
450 bool CSG_mRMR::Discretize(
double Threshold)
// No data, invalid threshold, or already discretized: nothing to do.
452 if( !m_Samples[0] || Threshold < 0.0 || m_bDiscretized )
460 for(j=1; j<m_nVars; j++)
466 for(i=0; i<m_nSamples; i++)
468 cursum += m_Samples[i][j];
471 curmean = cursum / m_nSamples;
476 for(i=0; i<m_nSamples; i++)
478 tmpf = m_Samples[i][j] - curmean;
479 cursum += tmpf * tmpf;
// Sample (n-1) standard deviation; guarded only against n == 1.
482 curstd = (m_nSamples == 1) ? 0 : sqrt(cursum / (m_nSamples - 1));
// NOTE(review): a constant variable yields curstd == 0 and a
// division by zero below — confirm whether a guard exists in the
// hidden lines.
484 for(i=0; i<m_nSamples; i++)
486 m_Samples[i][j] = (m_Samples[i][j] - curmean) / curstd;
// Second pass: threshold the standardized values into states.
491 for(j=1; j<m_nVars; j++)
495 for(i=0; i<m_nSamples; i++)
497 tmpf = m_Samples[i][j];
499 if( tmpf > Threshold )
503 else if( tmpf < -Threshold )
512 m_Samples[i][j] = tmpf;
516 m_bDiscretized =
true;
// Appends one selected feature to the m_pSelection result table
// (1-based rank, feature index, name, score) and, in verbose mode,
// logs a matching tab-separated line. Relies on the local feaInd[]
// array of the enclosing Get_Selection scope. The macro's closing
// lines are not visible here.
527 #define ADD_FEATURE(rank, score) {\
528 CSG_Table_Record *pFeature = m_pSelection->Add_Record();\
530 pFeature->Set_Value(SG_mRMR_SELECTION_RANK , rank + 1);\
531 pFeature->Set_Value(SG_mRMR_SELECTION_INDEX, feaInd[rank]);\
532 pFeature->Set_Value(SG_mRMR_SELECTION_NAME , m_VarNames[(size_t)feaInd[rank]]);\
533 pFeature->Set_Value(SG_mRMR_SELECTION_SCORE, score);\
535 ADD_MESSAGE(CSG_String::Format(SG_T("%d \t %d \t %s \t %5.3f"),\
536 rank + 1, feaInd[rank], m_VarNames[(size_t)feaInd[rank]].c_str(), score)\
// qsort comparator for TPool entries, ordering by ascending mival.
// Since mival is stored negated (-MI), ascending order here yields
// descending mutual information. The return statements are in lines
// not visible here.
541 int CSG_mRMR::Pool_Compare(
const void *a,
const void *b)
543 if( ((TPool *)a)->mival < ((TPool *)b)->mival )
546 if( ((TPool *)a)->mival > ((TPool *)b)->mival )
// Runs the mRMR selection: ranks all variables by mutual information
// with the class (max-relevance), then greedily adds features that
// maximize relevance minus/over redundancy (MID/MIQ), writing each
// pick into the m_pSelection table via ADD_FEATURE.
553 bool CSG_mRMR::Get_Selection(
int nFeatures,
int Method)
555 m_pSelection->Del_Records();
// The candidate pool is capped at 500 variables (minus the class
// column); nFeatures cannot exceed the pool size.
571 long poolUseFeaLen = 500;
572 if( poolUseFeaLen > m_nVars - 1 )
573 poolUseFeaLen = m_nVars - 1;
575 if( nFeatures > poolUseFeaLen )
576 nFeatures = poolUseFeaLen;
// feaInd[k] = variable index of the k-th selected feature.
578 long *feaInd =
new long[nFeatures];
590 TPool *Pool = (TPool *)
SG_Malloc(m_nVars *
sizeof(TPool));
// Store -MI(class, i) so that an ascending qsort (Pool_Compare)
// produces descending mutual information.
602 for(i=0; i<m_nVars; i++)
604 Pool[i].mival = -Get_MutualInfo(0, i);
// Sort candidates only (entry 0 is the class variable itself).
609 qsort(Pool + 1, m_nVars - 1,
sizeof(TPool), Pool_Compare);
// Undo the negation for reporting: Pool[0].mival is the class
// self-information (entropy).
611 Pool[0].mival = -Pool[0].mival;
614 SG_T(
"\nTarget classification variable (#%d column in the input data) has name=%s \tentropy score=%5.3f"),
615 0 + 1, m_VarNames[0].c_str(), Pool[0].mival
618 ADD_MESSAGE(
"\n*** MaxRel features ***");
619 ADD_MESSAGE(
"Order\tFea\tName\tScore");
// Report the max-relevance ranking, restoring positive MI values.
621 for(i=1; i<m_nVars-1; i++)
623 Pool[i].mival = -Pool[i].mival;
628 i, (
int)Pool[i].Index, m_VarNames[(
int)Pool[i].Index].c_str(), Pool[i].mival
636 long poolFeaIndMin = 1;
637 long poolFeaIndMax = poolFeaIndMin + poolUseFeaLen - 1;
// The top-relevance feature is always selected first; Mask == 0
// marks a variable as already selected.
639 feaInd[0] = Pool[1].Index;
640 Pool[feaInd[0]].Mask = 0;
643 ADD_MESSAGE(
"\n*** mRMR features ***");
644 ADD_MESSAGE(
"Order\tFea\tName\tScore");
646 ADD_FEATURE(0, Pool[1].mival);
// Greedy loop: pick the remaining nFeatures-1 features one by one.
648 for(k=1; k<nFeatures; k++)
650 double relevanceVal, redundancyVal, tmpscore, selectscore;
652 int b_firstSelected = 0;
654 for(i=poolFeaIndMin; i<=poolFeaIndMax; i++)
// Skip variables already selected (Mask cleared above); the
// skip statement is in lines not visible here.
656 if( Pool[Pool[i].Index].Mask == 0 )
// Relevance: MI with the class; redundancy: mean MI with the
// already-selected features (accumulation loop over j).
661 relevanceVal = Get_MutualInfo(0, Pool[i].Index);
666 redundancyVal += Get_MutualInfo(feaInd[j], Pool[i].Index);
// MID: difference criterion; MIQ: quotient criterion (small
// epsilon guards against division by zero).
674 case SG_mRMR_Method_MID: tmpscore = relevanceVal - redundancyVal;
break;
675 case SG_mRMR_Method_MIQ: tmpscore = relevanceVal / (redundancyVal + 0.0001);
break;
// Track the best-scoring candidate of this round.
678 if( b_firstSelected == 0 )
680 selectscore = tmpscore;
681 selectind = Pool[i].Index;
684 else if( tmpscore > selectscore )
686 selectscore = tmpscore;
687 selectind = Pool[i].Index;
691 feaInd[k] = selectind;
692 Pool[selectind].Mask = 0;
694 ADD_FEATURE(k, selectscore);
// Mutual information between two variables (by column index): casts
// the (discretized) samples to int, builds their joint probability
// table, and evaluates MI on it via the pab overload.
707 double CSG_mRMR::Get_MutualInfo(
long v1,
long v2)
718 if( v1 >= m_nVars || v2 >= m_nVars || v1 < 0 || v2 < 0 )
728 int *v1data =
new int[m_nSamples];
729 int *v2data =
new int[m_nSamples];
// NOTE(review): standard operator new throws instead of returning
// NULL, so this check is effectively dead code unless a nothrow
// allocator is configured project-wide.
731 if( !v1data || !v2data )
// Truncating cast: assumes samples hold integer-valued states
// (guaranteed after Discretize, or when Threshold < 0 was passed
// for already-discrete data).
738 for(
long i=0; i<m_nSamples; i++)
740 v1data[i] = (int)m_Samples[i][v1];
741 v2data[i] = (int)m_Samples[i][v2];
749 int nstate1 = 0, nstate2 = 0;
751 double *pab = Get_JointProb(v1data, v2data, m_nSamples, nstate, nstate1, nstate2);
753 mi = Get_MutualInfo(pab, nstate1, nstate2);
// Free the temporary state vectors (pab cleanup is in hidden lines).
758 DELETE_ARRAY(v1data);
759 DELETE_ARRAY(v2data);
// Mutual information from a joint probability table pab stored as a
// flat column-major array (pabhei x pabwid): builds marginals p1, p2
// and accumulates sum p(a,b) * log( p(a,b) / (p(a) p(b)) ).
766 double CSG_mRMR::Get_MutualInfo(
double *pab,
long pabhei,
long pabwid)
// 2-D view onto the flat array: pab2d[j] is column j.
781 double **pab2d =
new double *[pabwid];
783 for(j=0; j<pabwid; j++)
785 pab2d[j] = pab + (long)j * pabhei;
788 double *p1 = 0, *p2 = 0;
789 long p1len = 0, p2len = 0;
790 int b_findmarginalprob = 1;
// Marginal distributions (p1len/p2len are presumably set to
// pabhei/pabwid in hidden lines).
795 if( b_findmarginalprob != 0 )
799 p1 =
new double[p1len];
800 p2 =
new double[p2len];
802 for (i = 0; i < p1len; i++) p1[i] = 0;
803 for (j = 0; j < p2len; j++) p2[j] = 0;
805 for (i = 0; i < p1len; i++)
807 for (j = 0; j < p2len; j++)
810 p1[i] += pab2d[j][i];
811 p2[j] += pab2d[j][i];
// MI accumulation; zero cells contribute nothing and are skipped
// to avoid log(0).
823 for (j = 0; j < pabwid; j++)
825 for (i = 0; i < pabhei; i++)
827 if (pab2d[j][i] != 0 && p1[i] != 0 && p2[j] != 0)
829 muInf += pab2d[j][i] * log (pab2d[j][i] / p1[i] / p2[j]);
// Cleanup of the marginal arrays (in hidden lines).
842 if( b_findmarginalprob != 0 )
// Builds the joint histogram of two state vectors and (with
// b_returnprob set) normalizes it to a joint probability table.
// Returns the flat nstate1 x nstate2 array; nstate1/nstate2 are
// in-out: raised to the actual state counts found by Copy_Vector.
857 template <
class T>
double * CSG_mRMR::Get_JointProb(T * img1, T * img2,
long len,
long maxstatenum,
int &nstate1,
int &nstate2)
864 if (!img1 || !img2 || len < 0)
872 int b_returnprob = 1;
// Quantized copies of the inputs, shifted to 0-based states.
877 int *vec1 =
new int[len];
878 int *vec2 =
new int[len];
887 int nrealstate1 = 0, nrealstate2 = 0;
889 Copy_Vector(img1, len, vec1, nrealstate1);
890 Copy_Vector(img2, len, vec2, nrealstate2);
// Use at least the observed number of states.
893 nstate1 = (nstate1 < nrealstate1) ? nrealstate1 : nstate1;
895 nstate2 = (nstate2 < nrealstate2) ? nrealstate2 : nstate2;
// Flat histogram plus a column-pointer view (column-major:
// hab2d[state2][state1]).
904 double *hab =
new double[nstate1 * nstate2];
905 double **hab2d =
new double *[nstate2];
914 for(j=0; j<nstate2; j++)
916 hab2d[j] = hab + (long)j * nstate1;
919 for(i=0; i<nstate1; i++)
921 for(j=0; j<nstate2; j++)
// Count joint occurrences (the loop over samples is in hidden lines).
934 hab2d[vec2[i]][vec1[i]] += 1;
// Normalization to probabilities / cleanup follow in hidden lines.
943 for(i=0; i<nstate1; i++)
945 for(j=0; j<nstate2; j++)
// Rounds srcdata to the nearest int into desdata, shifts the values so
// the minimum becomes state 0 (shift loop in hidden lines), and reports
// the number of distinct states as (max - min + 1).
963 template <
class T>
void CSG_mRMR::Copy_Vector(T *srcdata,
long len,
int *desdata,
int &nstate)
965 if (!srcdata || !desdata)
// Seed min/max with the rounded first element (+0.5 for positive,
// -0.5 for negative values: round-half-away-from-zero).
981 maxx = minn = int (srcdata[0] + 0.5);
985 maxx = minn = int (srcdata[0] - 0.5);
992 for (i = 0; i < len; i++)
994 tmp1 = double (srcdata[i]);
995 tmp = (tmp1 > 0) ? (
int) (tmp1 + 0.5) : (int) (tmp1 - 0.5);
996 minn = (minn < tmp) ? minn : tmp;
997 maxx = (maxx > tmp) ? maxx : tmp;
// Second pass (hidden lines): write shifted states into desdata.
1003 for (i = 0; i < len; i++)
1009 nstate = (maxx - minn + 1);