// Name: cluster.C
// Author: J. Michael Word
// Date Written: 12/3/97
// Purpose: build clusters of related items by reading records from stdin
//          where each record represents a set of connected items
// Modifications: 
// 05/07/2000 - JMW - added (optional) sort for cluster lists
//                    and flag to suppress cluster number output
// 12/17/2002 - JMW - copied updated utility.C and .h files from reduce
//                    and made main return an int so would compile on OSX
// 12/31/2008 - JMW - substituted std::vector for Vector, switched to bool,
//                    and dropped swapInt

// *************************************************************
// NOTICE: This is free software and the source code is freely
// available. You are free to redistribute or modify under the
// conditions that (1) this notice is not removed or modified
// in any way and (2) any modified versions of the program are
// also available for free.
//               ** Absolutely no Warranty **
// Copyright (C) 1999 J. Michael Word
// *************************************************************

#include <cctype>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
using std::cout;
using std::cin;
using std::cerr;
using std::endl;

#include "DisjointSets.h"
#include "utility.h"

// Banner printed at the top of the help text. Points at a string literal,
// so the pointee must be const (binding a literal to char* is ill-formed
// since C++11).
static const char *versionString =
  "cluster: version 1.3 12/31/08, Copyright 1997-2008, J. Michael Word";

// Program-wide settings and state. The string settings point either at
// literals or at argv storage and are only ever read, hence const char*.
int ExpectedMax = 2000; // what is the max number of names expected? (-Max#)
int M           = 3079; // size of working arrays (prime > ExpectedMax)
int NumNames = 0; // count of names stored in TheSymTab, cannot exceed M
const char *FieldSep =" \t\r\n\v\f\a"; // input delimiter set handed to strtok (-F)
const char *OutputFieldSep ="\t";      // written between output columns (-O)
const char *OutputID ="";              // optional name column on every output line (-N)
std::vector< char * > TheSymTab; // hash table of names, M slots, NULL == empty slot
const int BufLen = 2000;         // line buffer length used when reading input
bool ShowSingletons = false; // -SINGLEtons: also list names connected to nothing
bool SortCluster    = false; // -SORT / -NUMeric: sort the members of each cluster
bool NumericSort    = false; // -NUMeric: compare members as numbers, not text
bool ShowClusterNum = true;  // cleared by -SKIP: omit the cluster number column

// Shorthand for a C string pointer; the sort work array holds these.
typedef char *cptr;

// Forward declarations for the functions defined below.

// Interprets the command line flags, sizes the symbol table, and returns
// the input file name (NULL means read standard input).
char* parseCommandLine(int argc, char **argv);
// Writes the usage text to stderr and exits the program.
void clusterHelp();
// Reads records from 'is', connects the names on each line, and prints
// the resulting clusters on stdout.
void processInput(std::istream& is);
// Sorts the n strings in s[] in place; numerically when asNum is non-zero,
// otherwise as text.
void sortTheCluster(int n, const cptr s[], int asNum);
// qsort comparators; each argument points at a cptr element.
int compare_numbers(const void *v1, const void *v2);
int compare_text(const void *v1, const void *v2);
// Picks a prime table size from a built-in list based on sz.
int genHashM( const int sz );
// Returns the symbol table slot holding key, inserting a copy if absent.
int insertKey(const char *key);
// Returns the hash of key modulo M and stores the probe step in h2.
int hash2(const char *key, int& h2);
// Returns 1 if key contains at least one graphic character, else 0.
int nonblank(const char *key);

int main(int argc, char **argv) {
	char *inFile = parseCommandLine(argc, argv);

	if (inFile) {
		std::ifstream theinputstream(inFile);
		processInput(theinputstream);
	}
	else { processInput(cin); }
	return 0;
}

// Returns the value belonging to a flag that takes an argument.
// 'flag' is the whole argument (including the leading '-') and n is the
// length of the matched flag name. The value is either glued to the flag
// ("-Fx") or is the following argument ("-F x"); in the second case i is
// advanced past it. Returns NULL when no value is available.
static char* clusterFlagValue(char *flag, int n, int& i, int argc, char **argv) {
	if (flag[n+1])  { return &flag[n+1]; }
	if (++i < argc) { return argv[i]; }
	return NULL;
}

// Interprets the command line, updating the global settings, then sizes
// and clears the symbol table for the expected number of names.
//
// Returns the input file name, or NULL when standard input was requested
// with "-". Prints the help text (which exits) when no arguments are given,
// on -Help, or when no input source was named. Unrecognized flags and extra
// parameters are reported on stderr and ignored.
//
// NOTE(review): compArgStr and parseInteger come from utility.h. From their
// use here compArgStr appears to return the number of flag characters
// matched (zero on no match), its last argument being the minimum
// abbreviation length; parseInteger(p, n+1, 10) appears to read the number
// that follows the flag name -- confirm against utility.h.
char* parseCommandLine(int argc, char **argv) {
	char *inFile = NULL;
	int nfile = 0, n = 0;

	if (argc <= 1) {
		clusterHelp();
	}
	for (int i = 1; i < argc; i++) {
		char *p = argv[i];
		if (p[0] == '-') {
			if (p[1] == '\0') {
				nfile = 1;
				inFile = NULL; // i.e. standard input
			}
			else if (compArgStr(p+1, "Help", 1)) {
				clusterHelp();
			}
			else if (compArgStr(p+1, "SINGLEtons", 6)) {
				ShowSingletons = true;
			}
			else if (compArgStr(p+1, "SORT", 4)) {
				SortCluster = true;
			}
			else if (compArgStr(p+1, "NUMeric", 3)) {
				SortCluster = true;
				NumericSort = true;
			}
			else if (compArgStr(p+1, "SKIP", 4)) {
				ShowClusterNum = false;
			}
			else if ((n = compArgStr(p+1, "F", 1)) != 0) {
				char *val = clusterFlagValue(p, n, i, argc, argv);
				if (val) { FieldSep = val; }
				else {
					cerr << "no field separator after -F flag" << endl;
				}
			}
			else if ((n = compArgStr(p+1, "O", 1)) != 0) {
				char *val = clusterFlagValue(p, n, i, argc, argv);
				if (val) { OutputFieldSep = val; }
				else {
					cerr << "no output field separator after -O flag" << endl;
				}
			}
			else if ((n = compArgStr(p+1, "N", 1)) != 0) {
				char *val = clusterFlagValue(p, n, i, argc, argv);
				if (val) { OutputID = val; }
				else {
					cerr << "no name after -N flag" << endl;
				}
			}
			else if ((n = compArgStr(p+1, "Max", 1)) != 0) {
				ExpectedMax = parseInteger(p, n+1, 10);
			}
			else {
				cerr << "unrecognized flag, \"" << p << "\", ignored." << endl;
			}
		}
		else if (nfile <= 0) {
			// the first non-flag argument is the input file
			inFile = p;
			nfile = 1;
		}
		else {
			cerr << "unrecognized parameter, \"" << p << "\", ignored." << endl;
		}
	}
	if (nfile != 1) { clusterHelp(); }

	// size the hash table for the requested capacity; every slot starts empty
	M = genHashM(ExpectedMax);
	TheSymTab.assign(M, static_cast<char *>(NULL));

	return inFile;
}

// Prints the version banner and usage summary on stderr, then terminates
// the program with exit status 1. Never returns.
void clusterHelp() {
	// usage lines that come before the -Max# entry ("" is a blank line)
	static const char *const beforeMax[] = {
		"Read in lines consisting of two or more names and output",
		"connected clusters of names. Each line of output is",
		"prefixed with a cluster number, the size of the cluster",
		"and an optional name string.",
		"",
		"arguments: [-flags] filename or -",
		"",
		"Flags:",
		"",
		"-F c     input  field separator is 'c' (default is whitespace).",
		"-O c     output field separator is 'c' (default is tab).",
		"-N name  prefix cluster with name."
	};
	// usage lines that come after the -Max# entry
	static const char *const afterMax[] = {
		"-SINGLEtons  include unconnected singletons in the output.",
		"-SORT    sort within each cluster (ascii order).",
		"-NUMeric sort within clusters (numeric order).",
		"-SKIP    suppress display of cluster id in the first column.",
		"",
		"-Help  write out this description"
	};
	const int numBefore = sizeof(beforeMax) / sizeof(beforeMax[0]);
	const int numAfter  = sizeof(afterMax)  / sizeof(afterMax[0]);

	cerr << versionString << endl << endl;
	for (int k = 0; k < numBefore; k++) {
		cerr << beforeMax[k] << endl;
	}
	// the only line with a run-time value: the current name limit
	cerr << "-Max#    max number of different input names (default="<<ExpectedMax<<")" << endl;
	for (int k = 0; k < numAfter; k++) {
		cerr << afterMax[k] << endl;
	}
	exit(1);
}

void processInput(std::istream& is) {
	static char buf[BufLen+1];

	DisjointSets connsets(M);

	// read input

	while (is.getline(buf, BufLen)) {
		char *p = strtok(buf,FieldSep);
		int lst = -1;
		while (p) {
			if (nonblank(p)) {
				int k = insertKey(p);
				if (lst >= 0 && k >= 0) {
					connsets.connect(lst, k); }
				lst = k;
			}
			p = strtok(NULL, FieldSep);
		}
	}

	// determine clusters

	int ** djss = connsets.subsets();

	// write clusters

	int clen = 0, cvsz = 0, i = 0;
	cptr * cvec = NULL;

	cvec = new cptr[cvsz=100];

	for(int j=1; j <= djss[0][0]; j++) {
		if (ShowClusterNum) { cout << j; }
		cout << OutputFieldSep << djss[j][0]
		     << OutputFieldSep << OutputID;

		clen = djss[j][0];

		if (cvsz < clen) { // resize sort work array if neccesary
	 		delete [] cvec; 
	 		cvec = new cptr[cvsz = (clen + 10)];
		}

		for(i=0; i < clen; i++) {
			cvec[i] = TheSymTab[ djss[j][i+1] ];
		}

		if (SortCluster) {
	 		sortTheCluster(clen, cvec, NumericSort);
		}

		for(i=0; i < clen; i++) {
			cout << OutputFieldSep << cvec[i];
		}
		cout << endl;
	}
	delete [] cvec; cvec = NULL; cvsz = 0;

	if (ShowSingletons) {
		int ng = connsets.numGroups();

		for(int h=0; h < connsets.size(); h++) {
	 		if (TheSymTab[h] && connsets.singleton(h)) {
		 		++ng;
		 		if (ShowClusterNum) { cout << ng; }
		 		cout   << OutputFieldSep << 1
				       << OutputFieldSep << OutputID
				       << OutputFieldSep << TheSymTab[h] << endl;
	 		}
		}
	}

	freeDJsubsets(djss); // finished with array of subset indexes
}

void sortTheCluster(int n, const cptr s[], int asNum) {
	if (asNum) {
		qsort((void*) s, n, sizeof(cptr), compare_numbers);
	}
	else {
		qsort((void*) s, n, sizeof(cptr), compare_text);
	}
}

// qsort comparator ordering two strings by their numeric value (atof).
// v1 and v2 each point at a char* array element.
// Returns -1, 0 or 1. Text that atof cannot parse counts as 0.
//
// A value that parses as NaN (e.g. the name "nan") compares false against
// everything; treating that as "equal" gave qsort an inconsistent ordering.
// NaNs are therefore placed after every real number and equal to each other.
int compare_numbers(const void *v1, const void *v2) {
	const double n1 = atof(*static_cast<char *const *>(v1));
	const double n2 = atof(*static_cast<char *const *>(v2));

	const bool nan1 = (n1 != n1); // only NaN differs from itself
	const bool nan2 = (n2 != n2);

	if (nan1 || nan2) {
		if (nan1 && nan2) { return 0; }
		return nan1 ? 1 : -1;
	}

	if      (n1 < n2) { return -1; }
	else if (n1 > n2) { return  1; }

	return 0;
}

int compare_text(const void *v1, const void *v2) {
	const char *s1 = *(const cptr *)v1;
	const char *s2 = *(const cptr *)v2;

	// cerr << "T " << s1 << " < " << s2 << " ?" << endl;

	return strcmp(s1, s2);
}

// Returns 1 if key contains at least one graphic (printing, non-space)
// character and 0 otherwise, including for an empty or NULL key.
int nonblank(const char *key) {
	if (!key) { return 0; }

	for (const char *p = key; *p; ++p) {
		// isgraph requires a value representable as unsigned char; passing
		// a negative plain char (bytes >= 0x80) is undefined behaviour
		if (isgraph(static_cast<unsigned char>(*p))) { return 1; }
	}
	return 0;
}

// Finds key in the symbol table, adding a private copy of it when it is
// not there yet, and returns the index of its slot.
//
// The table uses open addressing: starting at the hash of key, slots are
// visited in steps of h2 (modulo M) until the key or an empty slot is
// found. The table is kept at least two slots short of full, so the
// search always reaches an empty slot.
//
// Terminates the program with a non-zero status when the table has no
// room left or the copy of the name cannot be allocated.
int insertKey(const char *key) {
	int h2 = 1;
	int h  = hash2(key, h2);

	// defensive: never index the table with a negative value
	if (h < 0) { h += M; }

	while (TheSymTab[h] && strcmp(key, TheSymTab[h]) != 0) {
		h = (h+h2) % M;
	}
	if (! TheSymTab[h]) {
		if (NumNames >= M-2) {
			cerr << "too many names, increase -Max# value" << endl;
			exit(1); // a failure must not report success
		}
		char *copy = strdup(key);
		if (! copy) {
			// a NULL here would leave the slot looking empty while still
			// being counted as used
			cerr << "out of memory while storing name \"" << key << "\"" << endl;
			exit(1);
		}
		TheSymTab[h] = copy;
		NumNames++;
	}
	return h;
}

// Hashes key for the symbol table.
//
// Returns the primary hash, a value in [0, M): the characters of key are
// folded in as base-64 digits, reduced modulo M at every step.
// h2 receives the probe step, a value in [1, 16] taken from the low four
// bits of the last character (1 for an empty key).
//
// Characters are read as unsigned char. With a signed plain char, bytes
// >= 0x80 are negative and made the hash negative, which the caller then
// used as an array index.
int hash2(const char *key, int& h2) {
	const char *p = key;
	int h = 0;

	for (; *p; p++) {
		// h < M, and the largest table size genHashM offers keeps
		// 64*h + 255 well inside the range of int
		h = (64*h + static_cast<unsigned char>(*p)) % M;
	}
	h2 = (p == key) ? 1 : (16 - (p[-1]&15));
	return h;
}

// ----------------------------------------------------
//  return a prime number to be used as hash table size:
//  the first entry of a built-in list that exceeds 1.5
//  times sz, or the largest entry when none does
// ----------------------------------------------------
int genHashM( const int sz ) {
	static int plist[] = {
		577, 1117, 1987, 3079, 4057, 6133,
		7919, 9733, 11657, 13499, 17389, 20357,
		29443, 43051, 63809, 81799, 104729, 287117, 611953, 814279,
		1299709, 2750159, 3497861, 5023307, 7368787, 15485863
	};
	const int count = (sizeof(plist)/sizeof(int));

	// leave the table at most two-thirds full for the expected name count
	const double wanted = sz*1.5;

	for (int idx = 0; idx < count; idx++) {
		if ( wanted < plist[ idx ] ) {
			return plist[ idx ];
		}
	}
	return plist[ count - 1 ];
}
