Source: src/fingerprint.js

/**
 * Fingerprint Module.
 * The concept for fingerprinting is taken from OpenRefine:
 * https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth
 * @class fingerprint
 */

'use strict';

let meta = require('metaphone'),
  cologne = require('cologne-phonetic'),
  snowball = require('node-snowball')

const fingerprint = (function () {

  const module = {},
    punct = /[~`!@#$%^&*(){}\[\];:"'<,.>?\/\\|_+=-]/g, //eslint-disable-line no-useless-escape
    printable = /[^A-Za-z0-9\s]+/g,
    whitespace = /\s/g

  /*
   * Own-property test that ignores members inherited from Object.prototype
   * (a bare `in` check reports 'constructor', 'toString', ... as present
   * on every plain object).
   */
  const hasOwn = (obj, prop) => Object.prototype.hasOwnProperty.call(obj, prop)

  /*
   * Create an ordinary own, enumerable property. Unlike plain assignment this
   * also works for the name '__proto__', which would otherwise replace the
   * object's prototype instead of adding a property.
   */
  const defineOwn = (obj, prop, value) => Object.defineProperty(obj, prop, {
    'configurable': true,
    'enumerable': true,
    'value': value,
    'writable': true
  })

  /*
   * ASCII replacement followed by the characters it stands in for.
   * NOTE: only the latin-1 supplement and latin extended-A code charts are covered.
   */
  const translations = [
    ['a', 'ÀÁÂÃÄÅàáâãäåĀāĂăĄą'],
    ['c', 'ÇçĆćĈĉĊċČč'],
    ['d', 'ÐðĎďĐđ'],
    ['e', 'ÈÉÊËèéêëĒēĔĕĖėĘęĚě'],
    ['g', 'ĜĝĞğĠġĢģ'],
    ['h', 'ĤĥĦħ'],
    ['i', 'ÌÍÎÏìíîïĨĩĪīĬĭĮįİı'],
    ['j', 'Ĵĵ'],
    ['k', 'Ķķĸ'],
    ['l', 'ĹĺĻļĽľĿŀŁł'],
    // \u0149 is the single code point "n preceded by apostrophe"; a
    // two-character "apostrophe + n" entry could never match a single char.
    ['n', 'ÑñŃńŅņŇň\u0149Ŋŋ'],
    ['o', 'ÒÓÔÕÖØòóôõöøŌōŎŏŐő'],
    ['r', 'ŔŕŖŗŘř'],
    ['s', 'ŚśŜŝŞşŠšſß'],
    ['t', 'ŢţŤťŦŧ'],
    ['u', 'ÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲų'],
    ['w', 'Ŵŵ'],
    ['y', 'ÝýÿŶŷŸ'],
    ['z', 'ŹźŻżŽž']
  ]

  // Lookup table built once, instead of rebuilding the lists for every character.
  const asciiByChar = new Map()
  translations.forEach(([ascii, chars]) => {
    chars.split('').forEach(c => asciiByChar.set(c, ascii))
  })

  /*
   * Translate the given unicode char into its closest ASCII representation.
   * Characters without a known translation are returned unchanged.
   */
  const translate = char => (asciiByChar.has(char) ? asciiByChar.get(char) : char)

  /**
   * Asciify characters (for special lang chars).
   * @name asciify
   * @function
   * @memberOf fingerprint
   * @param {string} str - String to be transformed.
   * @return {string} Asciified string.
   */
  const asciify = str => str.split('').map(c => translate(c)).join('')

  /**
   * Transform string into key.
   *
   * The string is trimmed, stripped of punctuation and of every character
   * that is not A-Z, a-z, 0-9 or whitespace, split on whitespace and
   * de-duplicated; the fragments are then sorted and concatenated.
   * For type 'normal' the string is additionally lower-cased and asciified.
   * For type 'phonetic' every fragment is optionally stemmed and then
   * phonetically encoded (cologne-phonetic for lang 'german', metaphone otherwise).
   * @name key
   * @function
   * @memberOf fingerprint
   * @param {string} str - String to be transformed.
   * @param {string} type - normal or phonetic.
   * @param {object} params - if type is === phonetic then {lang:'german'||'other' , stemming:true||false} can be provided.
   * @return {string} The key.
   * @throws {Error} If str is not a non-empty string.
   */
  module.key = (str, type = 'normal', params = {
    'lang': 'german',
    'stemming': false
  }) => {
    const isString = typeof str === 'string' || str instanceof String
    if(!isString || str.length === 0){
      throw new Error('key function requires a string to work')
    }

    const isNormal = type === 'normal'

    let text = String(str).trim()
    if(isNormal){
      text = text.toLowerCase()
    }
    text = text.replace(punct, '')
    if(isNormal){
      text = asciify(text)
    }
    text = text.replace(printable, '')

    // A Set keeps the first occurrence of each fragment, in order.
    // De-duplication happens before any stemming / phonetic encoding.
    let tokens = Array.from(new Set(text.split(whitespace)))

    if(type === 'phonetic'){
      if(('stemming' in params) && params.stemming){
        tokens = tokens.map(t => snowball.stemword(t, params.lang))
      }

      const encode = (('lang' in params) && params.lang === 'german') ? cologne : meta
      tokens = tokens.map(t => encode(t))
    }

    tokens.sort()
    return tokens.join('')
  }

  /**
   * Key every string of an array and group the strings by their key.
   * @name analyse
   * @function
   * @memberOf fingerprint
   * @param {array} data - Strings to be keyed.
   * @param {string} type - normal or phonetic.
   * @param {object} params - if type is === phonetic then {lang:'german'||'other' , stemming:true||false} can be provided.
   * @return {object} map from key to an array of {id, label}, where id is the index in data and label the original string.
   */
  module.analyse = (data, type = 'normal', params = {
    'lang': 'german',
    'stemming': false
  }) => {
    const map = {}
    data.forEach((label, id) => {
      const key = module.key(label, type, params)
      // Own-property check: a key such as 'constructor' must not be
      // mistaken for an existing bucket.
      if(!hasOwn(map, key)){
        map[key] = []
      }
      map[key].push({
        'id': id,
        'label': label
      })
    })
    return map
  }

  /**
   * Cluster results of module.analyse.
   *
   * For every key the entries are grouped by label. The label with the most
   * ids is flagged with ok = 2 (on a tie the first one encountered wins),
   * all other labels keep ok = 1.
   * @name cluster
   * @function
   * @memberOf fingerprint
   * @param {object} map - created in module.analyse.
   * @return {array} One {cluster, key} object per key of map; cluster maps label to {ids, ok}.
   */
  module.cluster = map => {
    const clusters = []

    Object.keys(map).forEach(key => {
      const cluster = {}

      map[key].forEach(m => {
        // Labels are arbitrary user strings ('toString', '__proto__', ...),
        // so only own properties count and they are defined explicitly.
        if(!hasOwn(cluster, m.label)){
          defineOwn(cluster, m.label, {
            'ids': [],
            'ok': 1
          })
        }

        cluster[m.label].ids.push(m.id)
      })

      let max = -Number.MAX_VALUE,
        lastMax = null // null (not a falsy check) so that an empty label can be demoted too

      Object.keys(cluster).forEach(label => {
        const count = cluster[label].ids.length
        if(count > max){
          max = count
          cluster[label].ok = 2
          if(lastMax !== null){
            cluster[lastMax].ok = 1
          }
          lastMax = label
        }
      })

      clusters.push({
        cluster,
        key
      })
    })

    return clusters
  }

  /**
   * Translates the clusters from module.cluster into an easy to read and edit structure.
   * @name readableCluster
   * @function
   * @memberOf fingerprint
   * @param {array} clusters - created in module.cluster.
   * @return {array} One array per cluster, holding a {c, ids, label, ok} object per label (c is the number of ids).
   */
  module.readableCluster = clusters => clusters.map(c =>
    Object.keys(c.cluster).map(label => {
      const entry = c.cluster[label]
      return {
        'c': entry.ids.length,
        'ids': entry.ids,
        'label': label,
        'ok': entry.ok
      }
    })
  )

  return module;

})()

// Expose the fingerprint object as this file's CommonJS export.
module.exports = fingerprint