/*
DMERGE Copyright (c) 2005 Jonathan H Lundquist  (fluxsmith@fluxsmith.com)

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software, to make use of this software without 
restriction, including without limitation the rights to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell copies of this software,
and to permit persons to whom this software is furnished to do so;
subject to the following conditions:

The above copyright notice and this permission notice shall be
included in all copies whether modified or original, of this software,
or substantial portions thereof.

THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHOR OR COPYRIGHT HOLDER BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THIS
SOFTWARE OR THE USE OR OTHER DEALINGS IN THIS SOFTWARE.
*/

#include <memory>
#include <string>
#include <vector>
#include <map>
#include <list>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <sys/stat.h>
#include <dirent.h>
#include <unistd.h>
#include <stdlib.h>
#include <getopt.h>
#include <errno.h>
#include <wait.h>

struct flags {
   std::string compare;
   bool dry;
   std::string hash;
   bool quiet;
   bool stay;
   bool trust;
   bool verbose;
}; 

flags flags;

struct node_info {
   mode_t      st_mode;
   uid_t       st_uid;
   gid_t       st_gid;
   std::string name;

   node_info(const std::string& name, const struct stat& s) 
      : st_mode(s.st_mode), st_uid(s.st_uid), st_gid(s.st_gid), name(name) {}
};

struct dirinfo {
   mode_t        st_mode;
   uid_t         st_uid;
   gid_t         st_gid;

   dirinfo(const node_info& i) 
      : st_mode(i.st_mode), st_uid(i.st_uid), st_gid(i.st_gid) {}
};   
bool operator<(const dirinfo& x, const dirinfo& y);

// This container ensures that we only consider one entry for
// each member of an existing group of hard links.
typedef std::map<ino_t /* inode */, node_info> c_file_nodes;

// This container is used for an initial grouping by size so that
// only files with duplicate sizes need signature hashes calculated.
struct size_key {
   off_t	size;
   dev_t	dev;
 
   inline size_key(off_t size, dev_t dev) : size(size), dev(dev) {}
};

bool operator>(const size_key& x, const size_key& y)
{
   if ( x.size > y.size )
      return true;
   if ( x.size < y.size )
      return false;
   return x.dev > y.dev;
}

typedef std::map<size_key, std::vector<c_file_nodes::iterator>, std::greater<size_key> > c_initial_candidates;

// This container groups all files which are duplicate in content
// (barring hash collisions), permissions must still be considered.

struct hash_key {
   off_t	size;
   dev_t	dev;
   std::string	hash;

   inline hash_key(const size_key& sk, const std::string& hash) : size(sk.size), dev(sk.dev), hash(hash) {}
};

bool operator>(const hash_key& x, const hash_key& y)
{
   if ( x.size > y.size )
      return true;
   if ( x.size < y.size )
      return false;
   if ( x.dev > y.dev )
      return true;
   if ( x.dev < y.dev )
      return false;
   return x.hash > y.hash;
}

typedef std::map<hash_key, std::list<c_file_nodes::iterator>, std::greater<hash_key> > c_signed_candidates;

// This contains any files which hashed to a signature matching
// another file but failed to compare identically.
typedef std::vector<c_file_nodes::iterator> c_errors;

bool operator<(const dirinfo& x, const dirinfo& y)
{
   if ( x.st_mode == y.st_mode ) {
      if ( x.st_uid == y.st_uid )
         return x.st_gid < y.st_gid;
      else
         return x.st_uid < y.st_uid;
   }
   else
      return x.st_mode < y.st_mode;
}

std::string escape(std::string  s)
{
   char replace[] = "\\ &()`'$;";
   size_t pos = 0;
   size_t end = s.length();
   while ( pos != end ) {
      if ( strchr(replace, s[pos]) ) {
         s.insert(pos, 1, '\\');
         ++pos, ++end;
      }
      ++pos;         
   }
   return s;
}

void progress(char* message, ...)
{
   if ( !(flags.quiet | flags.verbose) ) {
      va_list ap;
      va_start(ap, message);
      vfprintf(stderr, message, ap);
   }
}

void verbose(char* message, ...)
{
   if ( flags.verbose && !flags.quiet ) {
      va_list ap;
      va_start(ap, message);
      vprintf(message, ap);
   }
}

void report(char* message, ...)
{
   if ( !flags.quiet ) {
      va_list ap;
      va_start(ap, message);
      vprintf(message, ap);
   }
}

void walkdir(const std::string& dir, c_file_nodes& nodes, c_initial_candidates& candidates)
{
   static int prog;
   static char indicator[] = "-\\|/";

   verbose("Directory: %s\n", dir.c_str());
   DIR* const cd(opendir(dir.c_str()));

   if ( !cd ) {
      fprintf(stderr, "Could not chdir to %s\n", dir.c_str());
      exit(1);
   }

   struct stat info;
   stat(dir.c_str(), &info);
   dev_t device = info.st_dev;
   struct dirent* dinfo;
   while ( (dinfo = readdir(cd)) != NULL ) {
      if ( strcmp(dinfo->d_name, ".") && strcmp(dinfo->d_name, "..") ) {
         if ( !(++prog % 5) )
            progress("\b%c", indicator[prog % 4]);

         std::string file(dir);
         if ( file[file.length() - 1] != '/' )
            file += '/';
         file += dinfo->d_name;

         if ( lstat(file.c_str(), &info) == -1 )
            continue;

         if ( S_ISLNK(info.st_mode) )
            continue;
      
         if ( stat(file.c_str(), &info) == -1 )
            continue;

         if ( S_ISDIR(info.st_mode) && info.st_dev == device && !flags.stay )
            walkdir(file, nodes, candidates);
         else if ( S_ISREG(info.st_mode) ) {
            const std::pair<c_file_nodes::iterator, bool> i(nodes.insert(std::make_pair(info.st_ino, node_info(file, info))));
            if ( i.second )
               candidates[size_key(info.st_size, info.st_dev)].push_back(i.first);
         }
      }
   }

   closedir(cd);
}

template<typename Container>
void purge_singles(c_file_nodes& nodes, Container& candidates, const std::string& s)
{
   const int limit(candidates.size());
   int count = 1, percent = -1, purged = 0;
   for ( typename Container::iterator i = candidates.begin(); i != candidates.end(); ++count ) {
      int done = (int)((float)count / (float)limit * 100.0);
      if ( done != percent || !(count % 100) ) {
         percent = done;
         progress("\rAnalyzing %s groups [%d/%d] %d%%", s.c_str(), count, limit, percent);
      }
      if ( i->second.size() == 1 ) {
         progress(", Purged %d", ++purged);
         percent = -1;
         nodes.erase(i->second.back());
         const typename Container::iterator x(i);
         ++i;
         candidates.erase(x);
      }
      else
         ++i;
   }
   progress("\n");
}

std::string getsignature(const std::string& filename)
{
   char signature[256];

   std::string command(flags.hash);
   command += ' ';
   command += escape(filename);
   verbose("%s\n", command.c_str());
   FILE* result = popen(command.c_str(), "r");
   if ( result == 0 ) {
      fprintf(stderr, "Error invoking %s\n", command.c_str());
      exit(1);
   }
 
   if ( !fgets(signature, 256, result) )
      *signature = 0;

   pclose(result);
   char* const sep(strchr(signature, ' '));
   if ( sep )
      *sep = 0;

   return std::string(signature);
}

void assign_signatures(c_initial_candidates& initial, c_signed_candidates& hashed, int limit)
{
   int count = 0, percent = -1;
   for ( c_initial_candidates::iterator i = initial.begin(); i != initial.end(); ) {
      verbose("\n");
      const size_key size_key(i->first);
      std::vector<c_file_nodes::iterator>& v(i->second);
      for ( std::vector<c_file_nodes::iterator>::iterator j = v.begin(); j != v.end(); ++j ) {
         const std::string signature = getsignature((*j)->second.name);
         if ( signature.empty() )
            continue;         

         hashed[hash_key(size_key, signature)].push_back(*j);

         int done = (int)((float)++count / (float)limit * 100.0);
         if ( done != percent || !(count % 100) ) {
            percent = done;
            progress("\rHashing [%d/%d] %d%%", count, limit, percent);
         }
      }
      const c_initial_candidates::iterator x(i);
      ++i;
      initial.erase(x);
   }
   progress("\n");
}

template<typename Iterator, typename Container>
inline Iterator increment(Iterator i, const Container& s)
{
   return i == s.end() ? i : ++i;
}

void compare_candidates(c_signed_candidates& candidates, c_errors& errors, int limit)
{
   int count = 0, percent = -1;
   for ( c_signed_candidates::iterator i = candidates.begin(); i != candidates.end(); ) {
      std::list<c_file_nodes::iterator>& iNodes = i->second;
      std::vector<std::list<c_file_nodes::iterator>::iterator> error;
      const std::list<c_file_nodes::iterator>::iterator j(iNodes.begin());
      ++count;
      for ( std::list<c_file_nodes::iterator>::iterator k = increment(j, iNodes); k != iNodes.end(); ++k ) {
         int ret;
         const std::string command(std::string("cmp -s ") + escape((*j)->second.name) + ' ' + escape((*k)->second.name));
         verbose("%s\n", command.c_str());
         if ( (ret = system(command.c_str())) != 0 )
            error.push_back(k);
         if ( WIFSIGNALED(ret) && (WTERMSIG(ret) == SIGINT || WTERMSIG(ret) == SIGQUIT) ) {
            fprintf(stderr, "Signal %d from cmp\n", ret);
            exit(ret);
         }
         if ( error.size() == candidates.size() - 1 )
            error.push_back(j);

         int done = (int)((float)++count / (float)limit * 100.0);
         if ( done != percent || !(count % 100) ) {
            percent = done;
            progress("\rComparing [%d/%d] %d%% ", count, limit, percent);
         }
      }

      for ( std::vector<std::list<c_file_nodes::iterator>::iterator>::iterator l = error.begin(); l != error.end(); ++l )
         errors.push_back(**l);

      for ( std::vector<std::list<c_file_nodes::iterator>::iterator>::iterator l = error.begin(); l != error.end(); ++l )
         iNodes.erase(*l);
         
      if ( iNodes.size() == 0 ) {
         const c_signed_candidates::iterator x(i);
         ++i;
         candidates.erase(x);
      }
      else
         ++i;
   }
   progress("\n");
}

void merge_candidates(c_signed_candidates& candidates, c_signed_candidates& errors)
{
   // Populated with entries already known to contain duplicate data 
   // (to group by permissions)
   typedef std::map<dirinfo, std::vector<c_file_nodes::iterator> > c_final_groups;

   for ( c_signed_candidates::iterator i = candidates.begin(); i != candidates.end(); ++i ) {
      std::list<c_file_nodes::iterator>& group(i->second);

      // Refine grouping to include owner, group, and permissions
      c_final_groups final_groups;
      for ( std::list<c_file_nodes::iterator>::iterator j = group.begin(); j != group.end(); ++j ) {
         final_groups[(*j)->second].push_back(*j);
      }

      // Accumulate all single-element permission groups into error list
      // and remove them from merge list.
      bool bError = false;
      for ( c_final_groups::iterator j = final_groups.begin(); j != final_groups.end(); ) {
         if ( j->second.size() == 1 ) {
            bError = true;
            errors[i->first].push_back(j->second[0]);
            const c_final_groups::iterator x(j);
            ++j;
            final_groups.erase(x);
         }
         else
            ++j;
      }
      // Add a a reference entry for reporting
      if ( bError && !final_groups.empty() ) {
         std::vector<c_file_nodes::iterator>& group(final_groups.begin()->second);
         errors[i->first].push_back(group.front());
      }

      // Merge duplicates into a common inode
      for ( c_final_groups::iterator j = final_groups.begin(); j != final_groups.end(); ++j ) {
         report("Merging:\n");
         std::vector<c_file_nodes::iterator>& group(j->second);
         std::vector<c_file_nodes::iterator>::iterator k = group.begin();
         report("%s\n", (*k)->second.name.c_str());
         for ( std::vector<c_file_nodes::iterator>::iterator l = increment(k, group); l != group.end(); ++l ) {
            report("%s\n", (*l)->second.name.c_str());
            if ( !flags.dry ) {
               const std::string sTemp((*l)->second.name + "._dmerge_");
               const char* const temp(sTemp.c_str());
               const char* const source((*k)->second.name.c_str());
               const char* const target((*l)->second.name.c_str());
               
               if ( link(target, temp) == 0 ) {
                  unlink(target);
                  if ( link(source, target) == 0 )
                     unlink(temp);
                  else
                     rename(temp, target);
               }
            }
         }
      }  
   }
}

void print_permission_errors(const c_signed_candidates& errors)
{
   if ( errors.empty() )
      return;

   report("\nThe following file groups could be merged if permissions were changed:\n");
   for ( c_signed_candidates::const_iterator i = errors.begin(); i != errors.end(); ++i ) {
      const std::list<c_file_nodes::iterator>& group(i->second);
      for ( std::list<c_file_nodes::iterator>::const_reverse_iterator j = group.rbegin(); j != group.rend(); ++j ) {
         report("%s\n", (*j)->second.name.c_str());
      }
      report("\n");
   }
}

void print_errors(const c_errors& errors)
{
   if ( errors.empty() )
      return;

   report("\nThe following files had MD5 collisions:\n");
   for ( c_errors::const_iterator i = errors.begin(); i != errors.end(); ++i )
      report("%s\n", (*i)->second.name.c_str());
}
   
void help_text()
{
  printf("Usage: dmerge [options] DIRECTORY [DIRECTORY[...]]\n\n");

  printf(" -c --compare \tcompare function, default is cmp\n");
  printf(" -d --dry     \tdry run, don't actually unlink and merge files\n");    
  printf(" -h --hash    \thash function, default is md5sum, a good alternative is sha1sum\n");
  printf(" -q --quiet   \tdon't report progress and statistics (overrides verbose)\n");
  printf(" -s --stay    \tstay in specified directory(s), do not recurse\n");
  printf(" -t --trust   \ttrust hashing function, skip file comparisons\n");
  printf(" -v --verbose \treport progress in excruciating detail\n\n");
}

struct option long_options[] = 
{
  { "compare", required_argument, 0, 'c' },
  { "dry",     0,                 0, 'd' },
  { "hash",    required_argument, 0, 'h' },
  { "quiet",   0,                 0, 'q' },
  { "stay",    0,                 0, 's' },
  { "trust",   0,                 0, 't' },
  { "verbose", 0,                 0, 'v' }, 
  { 0,         0,                 0, 0 }
};

int main(int argc, char **argv) {
   flags.compare = "cmp";
   flags.hash    = "md5sum";
   int opt;
   while ( (opt = getopt_long(argc, argv, "c:dh:qstv", long_options, NULL)) != EOF ) {
      switch (opt) {
      case 'c':  // compare
         flags.compare = optarg;
         break;
      case 'd':  // dry
         flags.dry = true;
         break;
      case 'h':  // hash
         flags.hash = optarg;
         break;
      case 'q':  // quiet
         flags.quiet = true;
         break;
      case 's':  // stay
         flags.stay = true;
         break;   
      case 't':  // trust
         flags.trust = true;
         break;
      case 'v':  // verbose
         flags.verbose = true;
         break;
      default:
         fprintf(stderr, "Try `dmerge' for usage information\n");
         exit(1);
      }
   }

   if ( optind >= argc ) {
      // no directories specified
      help_text();
      exit(1);
   }

   c_file_nodes         file_nodes;
   c_signed_candidates  signed_candidates;

   {
      c_initial_candidates initial_candidates;
      progress("\rBuiding file list  ");
      for ( int x = optind; x < argc; ++x ) 
         walkdir(argv[x], file_nodes, initial_candidates);
      progress("\b\b, %d files\n", file_nodes.size());

      purge_singles(file_nodes, initial_candidates, "size");

      assign_signatures(initial_candidates, signed_candidates, file_nodes.size());
   }
   
   purge_singles(file_nodes, signed_candidates, "hash");

   c_errors errors;
   if ( !flags.trust )
      compare_candidates(signed_candidates, errors, file_nodes.size());

   c_signed_candidates nonmatches;
   merge_candidates(signed_candidates, nonmatches);
   
   print_permission_errors(nonmatches);

   print_errors(errors);

   return 0;
}
