www.iramuteq.org Git - iramuteq/blob - documentation/similitude.txt

   1       names Jaccard, binary, Reyssac, Roux
   2         FUN R_bjaccard
   3    distance FALSE
   4      PREFUN pr_Jaccard_prefun
   5     POSTFUN NA
   6     convert pr_simil2dist
   7        type binary
   8        loop FALSE
   9       C_FUN TRUE
  10        abcd FALSE
  11     formula a / (a + b + c)
  12   reference Jaccard, P. (1908). Nouvelles recherches sur la
  13             distribution florale. Bull. Soc. Vaud. Sci. Nat., 44, pp.
  14             223--270.
  15 description The Jaccard Similarity (C implementation) for binary data.
  16             It is the proportion of (TRUE, TRUE) pairs, but not
  17             considering (FALSE, FALSE) pairs. So it compares the
  18             intersection with the union of object sets.
  19
  20       names Kulczynski1
  21         FUN pr_Kulczynski1
  22    distance FALSE
  23      PREFUN NA
  24     POSTFUN NA
  25     convert pr_simil2dist
  26        type binary
  27        loop TRUE
  28       C_FUN FALSE
  29        abcd TRUE
  30     formula a / (b + c)
  31   reference Kurczynski, T.W. (1970). Generalized distance and discrete
  32             variables. Biometrics, 26, pp. 525--534.
  33 description Kulczynski Similarity for binary data. Relates the (TRUE,
  34             TRUE) pairs to discordant pairs.
  35
  36       names Kulczynski2
  37         FUN pr_Kulczynski2
  38    distance FALSE
  39      PREFUN NA
  40     POSTFUN NA
  41     convert pr_simil2dist
  42        type binary
  43        loop TRUE
  44       C_FUN FALSE
  45        abcd TRUE
  46     formula [a / (a + b) + a / (a + c)] / 2
  47   reference Kurczynski, T.W. (1970). Generalized distance and discrete
  48             variables. Biometrics, 26, pp. 525--534.
  49 description Kulczynski Similarity for binary data. Relates the (TRUE,
  50             TRUE) pairs to the discordant pairs.
  51
  52       names Mountford
  53         FUN pr_Mountford
  54    distance FALSE
  55      PREFUN NA
  56     POSTFUN NA
  57     convert pr_simil2dist
  58        type binary
  59        loop TRUE
  60       C_FUN FALSE
  61        abcd TRUE
  62     formula 2a / (ab + ac + 2bc)
  63   reference Mountford, M.D. (1962). An index of similarity and its
  64             application to classificatory problems. In P.W. Murphy
  65             (ed.), Progress in Soil Zoology, pp. 43--50. Butterworth,
  66             London.
  67 description The Mountford Similarity for binary data.
  68
  69       names Fager, McGowan
  70         FUN pr_fagerMcgowan
  71    distance FALSE
  72      PREFUN NA
  73     POSTFUN NA
  74     convert pr_simil2dist
  75        type binary
  76        loop TRUE
  77       C_FUN FALSE
  78        abcd TRUE
  79     formula a / sqrt((a + b)(a + c)) - 1 / 2 sqrt(a + c)
  80   reference Fager, E. W. and McGowan, J. A. (1963). Zooplankton species
  81             groups in the North Pacific. Science, N. Y. 140: 453-460
  82 description The Fager / McGowan Similarity.
  83
  84       names Russel, Rao
  85         FUN pr_RusselRao
  86    distance FALSE
  87      PREFUN NA
  88     POSTFUN NA
  89     convert pr_simil2dist
  90        type binary
  91        loop TRUE
  92       C_FUN FALSE
  93        abcd TRUE
  94     formula a / n
  95   reference Russell, P.F., and Rao T.R. (1940). On habitat and
  96             association of species of anopheline larvae in
  97             southeastern Madras. J. Malaria Inst. India, 3, pp.
  98             153--178
  99 description The Russel/Rao Similarity for binary data. It is just the
 100             proportion of (TRUE, TRUE) pairs.
 101
 102       names simple matching, Sokal/Michener
 103         FUN pr_SimpleMatching
 104    distance FALSE
 105      PREFUN NA
 106     POSTFUN NA
 107     convert pr_simil2dist
 108        type binary
 109        loop TRUE
 110       C_FUN FALSE
 111        abcd TRUE
 112     formula (a + d) / n
 113   reference Sokal, R.R., and Michener, C.D. (1958). A statistical
 114             method for evaluating systematic relationships. Univ.
 115             Kansas Sci. Bull., 39, pp. 1409--1438.
 116 description The Simple Matching Similarity for binary data. It is the
 117             proportion of concordant pairs.
 118
 119       names Hamman
 120         FUN pr_Hamman
 121    distance FALSE
 122      PREFUN NA
 123     POSTFUN NA
 124     convert pr_simil2dist
 125        type binary
 126        loop TRUE
 127       C_FUN FALSE
 128        abcd TRUE
 129     formula ([a + d] - [b + c]) / n
 130   reference Hamann, U. (1961). Merkmalbestand und
 131             Verwandtschaftsbeziehungen der Farinosae. Ein Beitrag zum
 132             System der Monokotyledonen. Willdenowia, 2, pp. 639-768.
 133 description The Hamman Matching Similarity for binary data. It is the
 134             proportion difference of the concordant and discordant
 135             pairs.
 136
 137       names Faith
 138         FUN pr_Faith
 139    distance FALSE
 140      PREFUN NA
 141     POSTFUN NA
 142     convert pr_simil2dist
 143        type binary
 144        loop TRUE
 145       C_FUN FALSE
 146        abcd TRUE
 147     formula (a + d/2) / n
 148   reference Belbin, L., Marshall, C. & Faith, D.P. (1983). Representing
 149             relationships by automatic assignment of colour. The
 150             Australian Computing Journal 15, 160-163.
 151 description The Faith similarity
 152
 153       names Tanimoto, Rogers
 154         FUN pr_RogersTanimoto
 155    distance FALSE
 156      PREFUN NA
 157     POSTFUN NA
 158     convert pr_simil2dist
 159        type binary
 160        loop TRUE
 161       C_FUN FALSE
 162        abcd TRUE
 163     formula (a + d) / (a + 2b + 2c + d)
 164   reference Rogers, D.J, and Tanimoto, T.T. (1960). A computer program
 165             for classifying plants. Science, 132, pp. 1115--1118.
 166 description The Rogers/Tanimoto Similarity for binary data. Similar to
 167             the simple matching coefficient, but putting double weight
 168             on the discordant pairs.
 169
 170       names Dice, Czekanowski, Sorensen
 171         FUN pr_Dice
 172    distance FALSE
 173      PREFUN NA
 174     POSTFUN NA
 175     convert pr_simil2dist
 176        type binary
 177        loop TRUE
 178       C_FUN FALSE
 179        abcd TRUE
 180     formula 2a / (2a + b + c)
 181   reference Dice, L.R. (1945). Measures of the amount of ecologic
 182             association between species. Ecology, 26, pp. 297--302.
 183 description The Dice Similarity
 184
 185       names Phi
 186         FUN pr_Phi
 187    distance FALSE
 188      PREFUN NA
 189     POSTFUN NA
 190     convert pr_simil2dist
 191        type binary
 192        loop TRUE
 193       C_FUN FALSE
 194        abcd TRUE
 195     formula (ad - bc) / sqrt[(a + b)(c + d)(a + c)(b + d)]
 196   reference Sokal, R.R, and Sneath, P.H.A. (1963). Principles of
 197             numerical taxonomy. W.H. Freeman and Company, San
 198             Francisco.
 199 description The Phi Similarity (= Product-Moment-Correlation for binary
 200             variables)
 201
 202       names Stiles
 203         FUN pr_Stiles
 204    distance FALSE
 205      PREFUN NA
 206     POSTFUN NA
 207     convert pr_simil2dist
 208        type binary
 209        loop TRUE
 210       C_FUN FALSE
 211        abcd TRUE
 212     formula log(n(|ad-bc| - 0.5n)^2 / [(a + b)(c + d)(a + c)(b + d)])
 213   reference Stiles, H.E. (1961). The association factor in information
 214             retrieval. Communications of the ACM, 8, 1, pp. 271--279.
 215 description The Stiles Similarity (used for information retrieval).
 216             Identical to the logarithm of Krylov's distance.
 217
 218       names Michael
 219         FUN pr_Michael
 220    distance FALSE
 221      PREFUN NA
 222     POSTFUN NA
 223     convert pr_simil2dist
 224        type binary
 225        loop TRUE
 226       C_FUN FALSE
 227        abcd TRUE
 228     formula 4(ad - bc) / [(a + d)^2 + (b + c)^2]
 229   reference Cox, T.F., and Cox, M.A.A. (2001). Multidimensional
 230             Scaling. Chapman and Hall.
 231 description The Michael Similarity
 232
 233       names Mozley, Margalef
 234         FUN pr_MozleyMargalef
 235    distance FALSE
 236      PREFUN NA
 237     POSTFUN NA
 238     convert pr_simil2dist
 239        type binary
 240        loop TRUE
 241       C_FUN FALSE
 242        abcd TRUE
 243     formula an / [(a + b)(a + c)]
 244   reference Margalef, D.R. (1958). Information theory in ecology. Gen.
 245             Systems, 3, pp. 36--71.
 246 description The Mozley/Margalef Similarity
 247
 248       names Yule
 249         FUN pr_Yule
 250    distance FALSE
 251      PREFUN NA
 252     POSTFUN NA
 253     convert pr_simil2dist
 254        type binary
 255        loop TRUE
 256       C_FUN FALSE
 257        abcd TRUE
 258     formula (ad - bc) / (ad + bc)
 259   reference Yule, G.U. (1912). On measuring associations between
 260             attributes. J. Roy. Stat. Soc., 75, pp. 579--642.
 261 description Yule Similarity
 262
 263       names Yule2
 264         FUN pr_Yule2
 265    distance FALSE
 266      PREFUN NA
 267     POSTFUN NA
 268     convert pr_simil2dist
 269        type binary
 270        loop TRUE
 271       C_FUN FALSE
 272        abcd TRUE
 273     formula (sqrt(ad) - sqrt(bc)) / (sqrt(ad) + sqrt(bc))
 274   reference Yule, G.U. (1912). On measuring associations between
 275             attributes. J. Roy. Stat. Soc., 75, pp. 579--642.
 276 description Yule Similarity
 277
 278       names Ochiai
 279         FUN pr_Ochiai
 280    distance FALSE
 281      PREFUN NA
 282     POSTFUN NA
 283     convert pr_simil2dist
 284        type binary
 285        loop TRUE
 286       C_FUN FALSE
 287        abcd TRUE
 288     formula a / sqrt[(a + b)(a + c)]
 289   reference Sokal, R.R, and Sneath, P.H.A. (1963). Principles of
 290             numerical taxonomy. W.H. Freeman and Company, San
 291             Francisco.
 292 description The Ochiai Similarity
 293
 294       names Simpson
 295         FUN pr_Simpson
 296    distance FALSE
 297      PREFUN NA
 298     POSTFUN NA
 299     convert pr_simil2dist
 300        type binary
 301        loop TRUE
 302       C_FUN FALSE
 303        abcd TRUE
 304     formula a / min{(a + b), (a + c)}
 305   reference Simpson, G.G. (1960). Notes on the measurement of faunal
 306             resemblance. American Journal of Science 258-A: 300-311.
 307 description The Simpson Similarity (used in Zoology).
 308
 309       names Braun-Blanquet
 310         FUN pr_BraunBlanquet
 311    distance FALSE
 312      PREFUN NA
 313     POSTFUN NA
 314     convert pr_simil2dist
 315        type binary
 316        loop TRUE
 317       C_FUN FALSE
 318        abcd TRUE
 319     formula a / max{(a + b), (a + c)}
 320   reference Braun-Blanquet, J. (1964): Pflanzensoziologie. Springer
 321             Verlag, Wien and New York.
 322 description The Braun-Blanquet Similarity (used in Biology).
 323 #########################################################################@
 324       names cosine, angular
 325         FUN R_cosine
 326    distance FALSE
 327      PREFUN pr_cos_prefun
 328     POSTFUN NA
 329     convert pr_simil2dist
 330        type metric
 331        loop FALSE
 332       C_FUN TRUE
 333        abcd FALSE
 334     formula xy / sqrt(xx * yy)
 335   reference Anderberg, M.R. (1973). Cluster Analysis for Applications.
 336             Academic Press.
 337 description The cos Similarity (C implementation)
 338       names eJaccard, extended_Jaccard
 339         FUN R_ejaccard
 340    distance FALSE
 341      PREFUN pr_eJaccard_prefun
 342     POSTFUN NA
 343     convert pr_simil2dist
 344        type metric
 345        loop FALSE
 346       C_FUN TRUE
 347        abcd FALSE
 348     formula xy / (xx + yy - xy)
 349   reference Strehl A. and Ghosh J. (2000). Value-based customer
 350             grouping from large retail data-sets. In Proc. SPIE
 351             Conference on Data Mining and Knowledge Discovery, Orlando,
 352             volume 4057, pages 33-42. SPIE.
 353 description The extended Jaccard Similarity (C implementation; yields
 354             Jaccard for binary x,y).
 355       names fJaccard, fuzzy_Jaccard
 356         FUN R_fuzzy_dist
 357    distance FALSE
 358      PREFUN pr_fJaccard_prefun
 359     POSTFUN NA
 360     convert pr_simil2dist
 361        type metric
 362        loop FALSE
 363       C_FUN TRUE
 364        abcd FALSE
 365     formula sum_i (min{x_i, y_i} / max{x_i, y_i})
 366   reference Miyamoto S. (1990). Fuzzy sets in information retrieval and
 367             cluster analysis, Kluwer Academic Publishers, Dordrecht.
 368 description The fuzzy Jaccard Similarity (C implementation).
 369       names correlation
 370         FUN pr_cor
 371    distance FALSE
 372      PREFUN NA
 373     POSTFUN NA
 374     convert pr_simil2dist
 375        type metric
 376        loop TRUE
 377       C_FUN FALSE
 378        abcd FALSE
 379     formula xy / sqrt(xx * yy) for centered x,y
 380   reference Anderberg, M.R. (1973). Cluster Analysis for Applications.
 381             Academic Press.
 382 description correlation (taking n instead of n-1 for the variance)
 383 ######################################################################
 384       names Chi-squared
 385         FUN pr_ChiSquared
 386    distance FALSE
 387      PREFUN NA
 388     POSTFUN NA
 389     convert pr_simil2dist
 390        type nominal
 391        loop TRUE
 392       C_FUN FALSE
 393        abcd FALSE
 394     formula sum_ij (o_ij - e_ij)^2 / e_ij
 395   reference Anderberg, M.R. (1973). Cluster Analysis for Applications.
 396             Academic Press.
 397 description Sum of standardized squared deviations from observed and
 398             expected values in a cross-tab for x and y.
 399
 400       names Phi-squared
 401         FUN pr_PhiSquared
 402    distance FALSE
 403      PREFUN NA
 404     POSTFUN NA
 405     convert pr_simil2dist
 406        type nominal
 407        loop TRUE
 408       C_FUN FALSE
 409        abcd FALSE
 410     formula [sum_ij (o_ij - e_ij)^2 / e_ij] / n
 411   reference Anderberg, M.R. (1973). Cluster Analysis for Applications.
 412             Academic Press.
 413 description Standardized Chi-Squared (= Chi / n).
 414
 415       names Tschuprow
 416         FUN pr_Tschuprow
 417    distance FALSE
 418      PREFUN NA
 419     POSTFUN NA
 420     convert pr_simil2dist
 421        type nominal
 422        loop TRUE
 423       C_FUN FALSE
 424        abcd FALSE
 425     formula sqrt{[sum_ij (o_ij - e_ij)^2 / e_ij] / n / sqrt((p - 1)(q -
 426             1))}
 427   reference Tschuprow, A.A. (1925). Grundbegriffe und Grundprobleme der
 428             Korrelationstheorie. Springer.
 429 description Tschuprow-standardization of Chi-Squared.
 430
 431       names Cramer
 432         FUN pr_Cramer
 433    distance FALSE
 434      PREFUN NA
 435     POSTFUN NA
 436     convert pr_simil2dist
 437        type nominal
 438        loop TRUE
 439       C_FUN FALSE
 440        abcd FALSE
 441     formula sqrt{[Chi / n] / min[(p - 1), (q - 1)]}
 442   reference Cramer, H. (1946). The elements of probability theory and
 443             some of its applications. Wiley, New York.
 444 description Cramer-standardization of Chi-Squared.
 445
 446       names Pearson, contingency
 447         FUN pr_Pearson
 448    distance FALSE
 449      PREFUN NA
 450     POSTFUN NA
 451     convert pr_simil2dist
 452        type nominal
 453        loop TRUE
 454       C_FUN FALSE
 455        abcd FALSE
 456     formula sqrt{Chi / (n + Chi)}
 457   reference Anderberg, M.R. (1973). Cluster Analysis for Applications.
 458             Academic Press.
 459 description Contingency Coefficient. Chi is the Chi-Squared statistic.
 460
 461       names Gower
 462         FUN pr_Gower
 463    distance FALSE
 464      PREFUN pr_Gower_prefun
 465     POSTFUN NA
 466     convert pr_simil2dist
 467        type NA
 468        loop TRUE
 469       C_FUN FALSE
 470        abcd FALSE
 471     formula Sum_k (s_ijk * w_k) / Sum_k (d_ijk * w_k)
 472   reference Gower, J.C. (1971). A general coefficient of similarity and
 473             some of its properties. Biometrics, 27, pp. 857--871.
 474 description The Gower Similarity for mixed variable types. w_k are
 475             variable weights. d_ijk is 0 for missings or a pair of
 476             FALSE logicals, and 1 else. s_ijk is 1 for a pair of TRUE
 477             logicals or matching factor levels, and the absolute
 478             difference for metric variables. Each metric variable is
 479             scaled with its corresponding range, provided the latter is
 480             not 0. Ordinal variables are converted to ranks r_i and the
 481             scores z_i = (r_i - 1) / (max r_i - 1) are taken as metric
 482             variables. Note that in the latter case, unlike the
 483             definition of Gower, just the internal integer codes are
 484             taken as the ranks, and not what rank() would return. This
 485             is for compatibility with daisy() of the cluster package,
 486             and will make a slight difference in case of ties. The
 487             weights w_k can be specified by passing a numeric vector
 488             (recycled as needed) to the 'weights' argument. Ranges for
 489             scaling the columns of x and y can be specified using the
 490             'ranges.x'/'ranges.y' arguments (or simply 'ranges' for
 491             both x and y).