#!/usr/bin/env python import sys,os,string,math,random infinity = 1000000000.0 class theobj: def printit(): print def mymax(a,b): if a>b: return a else: return b # C++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++C # C C # C HIERARCHICAL CLUSTERING using (user-specified) criterion. C # C C # C Parameters: C # C C # C DATA(N,M) input data matrix, C # C DISS(LEN) dissimilarities in lower half diagonal C # C storage; LEN = N.N-1/2, C # C IOPT clustering criterion to be used, C # C IA, IB, CRIT history of agglomerations; dimensions C # C N, first N-1 locations only used, C # C MEMBR, NN, DISNN vectors of length N, used to store C # C cluster cardinalities, current nearest C # C neighbour, and the dissimilarity assoc. C # C with the latter. C # C FLAG boolean indicator of agglomerable obj./ C # C clusters. C # C C # C F. Murtagh, ESA/ESO/STECF, Garching, February 1986. C # C C # C------------------------------------------------------------C def hc(n,diss): mergelist = '' # n is the number of objects to be clustered # SUBROUTINE HC(N,M,LEN,IOPT,DATA,IA,IB,CRIT,MEMBR,NN,DISNN, # X FLAG,DISS) # REAL DATA(N,M),MEMBR(N),DISS(LEN) # INTEGER IA(N),IB(N) # REAL CRIT(N) # DIMENSION NN(N),DISNN(N) # LOGICAL FLAG(N) # REAL INF # DATA INF/1.E+20/ # C # C Initializations # C # DO I=1,N # MEMBR(I)=1. # FLAG(I)=.TRUE. # ENDDO # NCL=N membr = {} flag = {} for i in range(n): membr[i] = 1 flag[i] = 1 # C # C Construct dissimilarity matrix # C # DO I=1,N-1 # DO J=I+1,N # IND=IOFFSET(N,I,J) # DISS(IND)=0. # DO K=1,M # DISS(IND)=DISS(IND)+(DATA(I,K)-DATA(J,K))**2 # ENDDO # IF (IOPT.EQ.1) DISS(IND)=DISS(IND)/2. # C (Above is done for the case of the min. var. method # C where merging criteria are defined in terms of variances # C rather than distances.) # ENDDO # ENDDO # C # C Carry out an agglomeration - first create list of NNs # C # DO I=1,N-1 # DMIN=INF # DO J=I+1,N # IND=IOFFSET(N,I,J) # IF (DISS(IND).GE.DMIN) GOTO 500 # DMIN=DISS(IND) # JM=J # 500 CONTINUE # ENDDO # NN(I)=JM # DISNN(I)=DMIN # ENDDO nn = {} disnn = {} for i in range(n): dmin = infinity for j in range(n): if i==j: continue if diss[i][j]>=dmin: continue dmin = diss[i][j] jm = j nn[i] = jm disnn[i] = dmin # C # 400 CONTINUE # C Next, determine least diss. using list of NNs # DMIN=INF # DO I=1,N-1 # IF (.NOT.FLAG(I)) GOTO 600 # IF (DISNN(I).GE.DMIN) GOTO 600 # DMIN=DISNN(I) # IM=I # JM=NN(I) # 600 CONTINUE # ENDDO # NCL=NCL-1 for ncl in range(n-1): #print 'ncl',ncl dmin = infinity for i in range(n): if flag[i]==0: continue if disnn[i]>=dmin: continue dmin = disnn[i] im = i jm = nn[i] # C # C This allows an agglomeration to be carried out. # C # I2=MIN0(IM,JM) # J2=MAX0(IM,JM) # IA(N-NCL)=I2 # IB(N-NCL)=J2 # CRIT(N-NCL)=DMIN if jm=dmin: continue dmin = diss[i2][k] jj = k # MEMBR(I2)=MEMBR(I2)+MEMBR(J2) # DISNN(I2)=DMIN # NN(I2)=JJ disnn[i2] = dmin nn[i2] = jj # C # C Update list of NNs insofar as this is required. # C # DO I=1,N-1 # IF (.NOT.FLAG(I)) GOTO 900 # IF (NN(I).EQ.I2) GOTO 850 # IF (NN(I).EQ.J2) GOTO 850 # GOTO 900 # 850 CONTINUE # C (Redetermine NN of I:) # DMIN=INF # DO J=I+1,N # IND=IOFFSET(N,I,J) # IF (.NOT.FLAG(J)) GOTO 870 # IF (I.EQ.J) GOTO 870 # IF (DISS(IND).GE.DMIN) GOTO 870 # DMIN=DISS(IND) # JJ=J # 870 CONTINUE # ENDDO # NN(I)=JJ # DISNN(I)=DMIN # 900 CONTINUE # ENDDO for i in range(n): if flag[i]==0: continue if nn[i]!=i2 and nn[i]!=j2: continue dmin = infinity for j in range(n): global innerloopcounter innerloopcounter += 1 #print ncl,i,j if flag[j]==0: continue if i==j: continue if diss[i][j]>=dmin: continue dmin = diss[i][j] jj = j nn[i] = jj disnn[i] = dmin return mergelist # C # C Repeat previous steps until N-1 agglomerations carried out. # C # IF (NCL.GT.1) GOTO 400 # C # C # RETURN # END def mycmp(obj1,obj2): if obj1.sim>obj2.sim: return -1 else: if obj1.sim=hashobj-1: # continue # myobj = nextbest[n][nbpointer[n]] # print 'pre-candidate',n,myobj.partner,myobj.sim # the following loop will traverse each element # in the nextbest array. so the overall # complexity of this loop per call of # n2logn is O(n^2) for i in range(hashobj): while 1: if nbpointer[i]>=hashobj-1: break myobj = nextbest[i][nbpointer[i]] #print 'main cluster',i,assign[i] #print 'second cluster',myobj.partner,assign[myobj.partner] if assign[myobj.partner]==assign[i]: # this similarity is not relevant if # the two objects are already in the # same cluster #print 'skipping (same cluster)',i,myobj.partner,myobj.sim #print 'skipping (same cluster)',assign[i],assign[myobj.partner],myobj.sim nbpointer[i] += 1 continue mysim = sim[assign[i]][assign[myobj.partner]] if mysim=hashobj-1: continue myobj = nextbest[n][nbpointer[n]] #print 'candidate',n,myobj.partner,myobj.sim,'pos',nbpointer[n] if myobj.sim>maxsim: maxsim = myobj.sim c1 = n c2 = myobj.partner assert maxsim>=0 #print 'selected candidates',c1,c2 c1 = assign[c1] c2 = assign[c2] assert c1!=c2 if c2max: max = sim[n][i] cl1 = i cl2 = n assert cl1>=0 and cl2>=0 eliminated[cl1] = 1 mergelist += `cl1`+':'+`cl2`+' ' for n in range(hashobj): if assign[n]==cl1: assign[n]=cl2 # update the similarity matrix # the row corresponding to cl2 # is updated with similarity of cl1 # if similarity of cl1 is smaller for n in range(hashobj): if eliminated[n]: continue if sim[cl1][n]max: max = sim sourcedata[n][i] = sim sourcedata[i][n] = sim assert max<=1 return sourcedata def simtodis(hashobj,oldmatrix): newmatrix = {} for n in range(hashobj): newmatrix[n] = {} for i in range(hashobj): newmatrix[n][i] = -oldmatrix[n][i] return newmatrix def simtosim(hashobj,oldmatrix): newmatrix = {} for n in range(hashobj): newmatrix[n] = {} for i in range(hashobj): newmatrix[n][i] = oldmatrix[n][i] return newmatrix def distribution(hashiteration,increment,datamode): hashobj = 0 outfile = open('timing.xls',"w") for n in range(hashiteration): hashobj += increment global innerloopcounter innerloopcounter = 0 sourcedata = computesourcedata(hashobj,datamode) dismatrix = simtodis(hashobj,sourcedata) hc(hashobj,dismatrix) cubicroot = math.log(innerloopcounter)/3 cubicroot = math.exp(cubicroot) mystring = '' mystring += `hashobj`+'\t' mystring += `innerloopcounter`+'\t' mystring += `math.sqrt(innerloopcounter)`+'\t' mystring += `cubicroot`+'\t' outfile.write(mystring+'\n') print '1 mode: c(ompare output)/t(iming of murtaghs algorithm)' print '2 data: r(andom)/h(ard)' assert len(sys.argv)==3 algorithm = sys.argv[1] data = sys.argv[2] if data[0]=='r': datamode = 'random' else: datamode = 'pathological' if algorithm[0]=='c': hashobj = 20 innerloopcounter = 0 sourcedata = computesourcedata(hashobj,datamode) simmatrix = simtosim(hashobj,sourcedata) mergerscubic = runcubic(hashobj,simmatrix) print 'cubic',mergerscubic simmatrix = simtosim(hashobj,sourcedata) mergersn2logn = n2logn(hashobj,simmatrix) if mergerscubic==mergersn2logn: print 'n2logn algorithm is correct' else: print 'n2logn algorithm is not correct:',mergerscubic dismatrix = simtodis(hashobj,sourcedata) mergersmurtagh = hc(hashobj,dismatrix) if mergerscubic==mergersmurtagh: print 'murtaghs algorithm is correct' else: print 'murtagh s algorithm is not correct',mergersmurtagh else: assert algorithm[0]=='t' hashiteration = 10 increment = 20 distribution(hashiteration,increment,datamode)