/**
 * Overlap coefficient of two string sets: the number of shared elements
 * divided by the size of the smaller set.
 *
 * @param a - first set
 * @param b - second set
 * @returns a value between 0 and 1; 0 when either set is empty
 */
export function matchSetPerc(a: Set<string>, b: Set<string>): number {
    // Iterate the smaller set and probe the larger one; the intersection size is symmetric.
    const [smaller, larger] = a.size <= b.size ? [a, b] : [b, a];

    // An empty set shares nothing with anything; bail out instead of computing 0 / 0 (NaN).
    if (smaller.size === 0) {
        return 0;
    }

    let match = 0;
    for (const value of smaller) {
        if (larger.has(value)) {
            match++;
        }
    }

    return match / smaller.size;
}

const nlp = require('wink-nlp-utils');
const dbscan = require('@cdxoo/dbscan');

/**
 * Sums the pairwise products of two vectors, walking the indexes of `vecA`.
 *
 * @param vecA - left operand; its length decides how many components are read
 * @param vecB - right operand, read at the same indexes as `vecA`
 * @returns the accumulated sum of `vecA[i] * vecB[i]`
 */
function dotProduct(vecA: number[], vecB: number[]) {
    let total = 0;
    for (const [idx, component] of vecA.entries()) {
        total += component * vecB[idx];
    }
    return total;
}

/**
 * Euclidean length of a vector: the square root of the sum of squared components.
 *
 * @param vec - vector to measure
 * @returns the vector's length; 0 for an empty vector
 */
function magnitude(vec: number[]) {
    let sumOfSquares = 0;
    for (const component of vec) {
        sumOfSquares += component * component;
    }
    return Math.sqrt(sumOfSquares);
}

/**
 * Cosine similarity of two vectors: their dot product divided by the product
 * of their magnitudes.
 *
 * @param vecA - first vector
 * @param vecB - second vector
 * @returns the cosine of the angle between the vectors, or 0 when either
 *   vector has zero magnitude
 */
function cosineSimilarity(vecA: number[], vecB: number[]) {
    const denominator = magnitude(vecA) * magnitude(vecB);

    // A zero-magnitude vector has no direction; dividing would produce NaN (0 / 0),
    // so treat it as sharing nothing with the other vector.
    if (denominator === 0) {
        return 0;
    }

    return dotProduct(vecA, vecB) / denominator;
}

// todo do TF-IDF on names in groups to get display names
/**
 * Clusters items by textual similarity and ranks the clusters against a query.
 *
 * Each item's text (from `stringExtractor`) is lower-cased and tokenized, keeping only
 * word and number tokens. Every item becomes a weighted bag-of-words vector (number
 * tokens weigh 2, word tokens weigh 1) and the vectors are clustered with DBSCAN using
 * cosine distance (`1 - cosineSimilarity`).
 *
 * An item's priority is the number of distinct query tokens that also occur in its text;
 * a cluster's priority is the highest priority among its items.
 *
 * Side effect: logs the elapsed time through `console.time` / `console.timeEnd`.
 *
 * @param query - search text whose word/number tokens determine relevance
 * @param items - results to group
 * @param stringExtractor - returns the text used to compare an item
 * @returns an object with
 *   - `cluster`: clusters whose priority is above 0, sorted by priority (descending),
 *     then by cluster size (descending);
 *   - `unclustered`: items DBSCAN reported as noise;
 *   - `offtopic`: clusters in which no item shares a token with the query.
 */
export function group<T>(query: string, items: T[], stringExtractor: (t: T) => string) {
    type Token = { tag: 'word' | 'number', value: string };

    // Single place for token generation: lower-case, tokenize with tags, keep words and numbers only.
    const tokenize = (text: string): Token[] =>
        nlp.string.tokenize(text.toLowerCase(), true)
            .filter((x: { tag: string }) => x.tag === 'word' || x.tag === 'number');

    const _timeKey = `grouping ${items.length} results`;
    console.time(_timeKey);

    try {
        const searchTokensSet = new Set(tokenize(query).map(x => x.value));

        const str = items.map(stringExtractor);
        const tokens = str.map(s => tokenize(s));

        // Build in one pass:
        //  - dictionary: token value -> vector index, in order of first appearance
        //  - weights: token value -> weight (number = 2, word = 1); the last occurrence wins
        const dictionary = new Map<string, number>();
        const weights = new Map<string, number>();
        for (const itemTokens of tokens) {
            for (const token of itemTokens) {
                if (!dictionary.has(token.value)) {
                    dictionary.set(token.value, dictionary.size);
                }
                weights.set(token.value, token.tag === 'number' ? 2 : 1);
            }
        }

        // bag of words multiplied by weight
        const bow = tokens.map(itemTokens => {
            const vector: number[] = new Array(dictionary.size).fill(0);
            for (const token of itemTokens) {
                vector[dictionary.get(token.value)!] += weights.get(token.value)!;
            }
            return vector;
        });

        // do a dbscan https://en.wikipedia.org/wiki/DBSCAN
        const dbscanResult: { clusters: number[][], noise: number[] } = dbscan({
            dataset: bow,
            // maximum distance at which two points count as neighbours
            epsilon: 0.2,
            // distance = 1 - cosineSimilarity
            distanceFunction: (a: number[], b: number[]) => 1 - cosineSimilarity(a, b),
            // can have clusters with 1 point
            minimumPoints: 1,
        });

        // Priority per item index (not per item value, so duplicate items cannot collide):
        // how many distinct query tokens appear in the item's text.
        const priorities = tokens.map(itemTokens => {
            const words = new Set(itemTokens.map(x => x.value));
            let common = 0;
            for (const searchToken of searchTokensSet) {
                if (words.has(searchToken)) {
                    common++;
                }
            }
            return common;
        });

        // Split clusters into on-topic / off-topic in one pass, computing each cluster's
        // best priority exactly once (a plain loop also avoids spreading a large cluster
        // into Math.max arguments).
        const onTopic: { members: T[], best: number }[] = [];
        const offtopic: T[][] = [];
        for (const indexes of dbscanResult.clusters) {
            let best = -Infinity;
            for (const i of indexes) {
                if (priorities[i] > best) {
                    best = priorities[i];
                }
            }

            const members = indexes.map(i => items[i]);
            if (best > 0) {
                onTopic.push({ members, best });
            } else if (best === 0) {
                offtopic.push(members);
            }
        }

        // Highest priority first; among equal priorities, larger clusters first.
        onTopic.sort((a, b) => {
            if (a.best !== b.best) {
                return b.best - a.best;
            }
            return b.members.length - a.members.length;
        });

        const data = {
            cluster: onTopic.map(c => c.members),
            unclustered: dbscanResult.noise.map(i => items[i]),
            offtopic,
        };

        return data;
    } finally {
        // Always close the timer, even when tokenizing or clustering throws.
        console.timeEnd(_timeKey);
    }
}
