#!/usr/bin/env perl

use strict;
use warnings;
use File::Temp qw/ tempfile /;
use Fcntl ':flock';

# Single-instance guard: take a non-blocking exclusive lock on this script's own
# file. The handle is a file-scoped lexical that is kept open for the rest of the
# run, so a second copy started meanwhile fails to get the lock and aborts.
open(my $self, '<', $0) or die "Couldn't open self: $!";
unless (flock($self, LOCK_EX | LOCK_NB)) {
    die "This script is already running! Aborting!";
}

#EXPLAINER:
# Compare a Git repository (source of truth) to a deployed directory.
#
# Backstory:
# This was created because people commonly applied hotfixes directly in production,
# which were then overwritten by Git deployments and caused production incidents.
# This script was a necessary first step for cleaning up, before a strict
# "deploy via Git only" rule could be enforced.
#
# Recursively checks for:
#   - Files that exist in one location but not the other
#   - Files whose contents differ (compared line by line)
#
# Useful for:
#   - Verifying that a deployment matches the committed source
#   - Catching out-of-band edits on production servers
#   - Validating integrity of scripts, configs, and assets
#
# Originally written in bash, but it got too unreadable for the job.
# Rewritten in Perl for better sanity and maintainability.
#
# !! Not intended for projects with compiled binaries (hash mismatches are expected)
#USAGE perl <THIS SCRIPT> <GIT REPO PATH> <DEPLOYMENT PATH>

# Both directories are required: the Git checkout first, the deployed copy second.
die "Usage: $0 DIRECTORY1 DIRECTORY2\n" if scalar(@ARGV) < 2;

my $git_locations = $ARGV[0];
my $script_locations = $ARGV[1];

for my $dir ($git_locations, $script_locations) {
    # A typo in either argument would otherwise yield an empty listing and a
    # report claiming that every file is missing.
    die "Not a directory: $dir\n" unless -d $dir;

    # Drop trailing slashes (a bare "/" is left alone). The stems are stripped
    # from, and later re-prefixed to, every path, so "/repo/" paired with
    # "/deploy" would produce relative paths of different shapes that never match.
    $dir =~ s{(?<=[^/])/+\z}{};
}

my @git_found_files;
my @scripts_found_files;
my $file_diffs="./file_diffs.txt";
my $file_list="./file_list.txt";
my $changed_files="./changed_files.txt";
my %processed;

# $file_list is a work file that is only ever appended to; remove what a previous
# run left behind so stale entries are not compared again.
if (-e $file_list) {
    unlink $file_list or die "Cannot remove stale file ".$file_list.": $!";
}

my $stem1 = $git_locations;
my $stem2 = $script_locations;

# Scratch listings of every file found in each tree. The words "gitrepo" and
# "scripts" in the names are what list_non_present_files uses to label its
# report, so keep them. UNLINK => 1 removes the listings when the script exits.
my ($git, $git_files) = tempfile('/tmp/gitrepo-git-consistency-check-XXXXXXXXXXXX', SUFFIX => '.tmp', UNLINK => 1);
my ($scripts, $scripts_files) = tempfile('/tmp/scripts-git-consistency-check-XXXXXXXXXXXX', SUFFIX => '.tmp', UNLINK => 1);

#recursively search through target directory for all files
#
# Arguments:
#   $_[0] - directory to scan (a trailing slash is added when missing)
#   $_[1] - open, writable filehandle; one full path is printed per line
#   Any further arguments are accepted for backward compatibility and ignored.
#
# Entries whose names start with a dot are skipped, exactly as the former
# glob('*') did; this keeps the .git directory out of the listing.
# Directories are read with opendir/readdir instead of glob so that names
# containing spaces or glob metacharacters are listed correctly.
# NOTE: -d follows symbolic links, so symlinked directories are descended into.
sub find_files {
    my ($path, $output) = @_;

    $path .= '/' if $path !~ m{/\z};

    my $dh;
    if (!opendir $dh, $path) {
        warn "Cannot read directory $path: $!\n";
        return;
    }

    # Read everything up front so the directory handle is released before
    # recursing into subdirectories.
    my @entries = sort grep { !/\A\./ } readdir $dh;
    closedir $dh;

    for my $entry (@entries) {
        my $file = $path . $entry;
        if (-d $file) {
            find_files($file, $output);
        }
        else {
            print {$output} $file, "\n" or die "Cannot write file list: $!";
        }
    }

    return;
}

#take two lists of files, remove the stems of the path, and compare differences between the lists
#
# Arguments:
#   $_[0] - listing file (one path per line) whose unique entries are reported
#   $_[1] - listing file to compare against
#   $_[2] - report file; an "Only in ... folder" section is appended to it
#   $_[3] - first directory stem to strip from the front of each path
#   $_[4] - second directory stem to strip from the front of each path
#
# Paths found in both listings are appended to the file-level $file_list.
# The heading uses the word "scripts" or "gitrepo" taken from the name of the
# first listing file, falling back to the whole name when neither is present.
sub list_non_present_files {
    my ($primary_list, $other_list, $file_diffs, $stem1, $stem2) = @_;

    # Strip a stem only when it really is the leading part of the path, trying
    # the longer stem first. An unanchored substitution of both stems mangles
    # paths when one stem is contained in the other (e.g. /srv/app and
    # /srv/app-new) or when a stem's text reappears deeper inside a path.
    my @stems = sort { length($b) <=> length($a) }
                grep { defined $_ && length $_ } ($stem1, $stem2);
    my $relative = sub {
        my ($path) = @_;
        for my $stem (@stems) {
            return substr($path, length $stem) if index($path, $stem) == 0;
        }
        return $path;
    };

    # Everything present in the listing we compare against.
    my %seen;
    open(my $other, "<", $other_list) or die "Cannot open file ".$other_list." for reading: $!";
    while (my $line = <$other>) {
        chomp $line;
        $seen{ $relative->($line) }++;
    }
    close($other) or die "Could not finish reading from ".$other_list.": $!";

    my $source_name = $primary_list =~ /(scripts|gitrepo)/ ? $1 : $primary_list;

    my %fl;    # present in both listings
    my %df;    # present only in the primary listing
    open(my $primary, "<", $primary_list) or die "Cannot open file ".$primary_list." for reading: $!";
    while (my $line = <$primary>) {
        chomp $line;
        my $rel = $relative->($line);
        next if $rel eq "";
        if ($seen{$rel}) {
            $fl{$rel}++;
        }
        else {
            $df{$rel}++;
        }
    }
    close($primary) or die "Could not finish reading from ".$primary_list.": $!";

    # Sorted output keeps the reports stable from one run to the next.
    open(my $diffs, '>>', $file_diffs) or die "Cannot open file ".$file_diffs." for writing: $!";
    print {$diffs} "Only in ".$source_name." folder: \n";
    print {$diffs} "$_\n" for sort keys %df;
    print {$diffs} "\n";
    close($diffs) or die "Could not finish writing to ".$file_diffs.": $!";

    open(my $flist, '>>', $file_list) or die "Cannot open file ".$file_list." for writing: $!";
    print {$flist} "$_\n" for sort keys %fl;
    close($flist) or die "Could not finish writing to ".$file_list.": $!";

    return;
}

# Compare two files line by line and write a report of the differing lines.
#
# Arguments:
#   $_[0], $_[1] - paths of the two files to compare
#
# Returns the path of the larger of the two files when they differ, or "" when
# they are identical or when either path is not a regular file.
#
# On the first mismatch for a given file a report is created in the current
# directory as ./<path with slashes turned into dashes>XXXXXXXX.tmp and then
# compressed with gzip. The file-level %processed hash records which files
# already have a report, so a repeated comparison only returns the verdict.
sub compare_files {
    my ($file1, $file2) = @_;

    #skip anything that isn't a normal file
    return "" unless -f $file1;
    return "" unless -f $file2;

    # -s yields a false value rather than 0 for an empty file.
    my $filesize1 = (-s $file1) || 0;
    my $filesize2 = (-s $file2) || 0;

    # Treat the larger file as the primary one; its path names the report and
    # is the value returned on a mismatch.
    ($file1, $file2) = ($file2, $file1) if $filesize2 > $filesize1;

    my $fname = $file1;
    $fname =~ s/^\///;
    $fname =~ s/\//-/g;

    my $write_report = !exists $processed{$fname};

    open(my $in1,"<",$file1) or die "Cannot open file ".$file1." for reading: $!";
    open(my $in2,"<",$file2) or die "Cannot open file ".$file2." for reading: $!";

    my $lineno = 0;
    my $is_mismatch = 0;
    my ($co, $current_output);

    # Read both files in step until BOTH are exhausted. A line missing on one
    # side is a difference in its own right; substituting "\n" for it before
    # comparing hid trailing blank lines that exist in only one file.
    while (1) {
        my $line1 = <$in1>;
        my $line2 = <$in2>;
        last if !defined $line1 && !defined $line2;
        ++$lineno;
        next if defined $line1 && defined $line2 && $line1 eq $line2;

        if (!$is_mismatch) {
            $is_mismatch = 1;
            # Already reported earlier: the verdict is all that is needed.
            last unless $write_report;
            # Created only now, so identical files leave no temp file behind.
            # UNLINK => 0 because gzip removes the file itself on success and
            # the uncompressed report must survive if gzip fails.
            ($co, $current_output) = tempfile('./'.$fname.'XXXXXXXX', SUFFIX => '.tmp', UNLINK => 0);
            print {$co} "Mismatch between files: \n".$file1."\n".$file2."\nPlease check:\n";
        }
        print {$co} "line :".$lineno."\n";
        print {$co} (defined $line1 ? $line1 : "\n");
        print {$co} (defined $line2 ? $line2 : "\n");
    }

    close $in1 or die "Cannot close file: ".$file1.": $!";
    close $in2 or die "Cannot close file: ".$file2.": $!";

    if (defined $co) {
        close $co or die "Cannot close file: ".$current_output.": $!";
        $processed{$fname}++;
        # List form bypasses the shell, so spaces or metacharacters in the
        # report name cannot break or alter the command.
        system('gzip', $current_output) == 0
            or warn "gzip failed for ".$current_output." (report left uncompressed)\n";
    }

    return $file1 if $is_mismatch == 1;
    return "";
}

# Compare every file named in a listing between the two trees and append a
# summary of the files that differ.
#
# Arguments:
#   $_[0] - listing file with one relative path per line
#   $_[1] - stem prefixed to each path to locate the first copy
#   $_[2] - stem prefixed to each path to locate the second copy
#   $_[3] - report file; the summary is appended to it
#
# Paths ending in .jar or .gz are skipped.
sub read_list {
    my ($list, $stem1, $stem2, $changed_files) = @_;
    my %changed_file_list;
    my %already_compared;

    open(my $ll,'<',$list) or die "Cannot open file ".$list." for reading: $!";
    while (my $line = <$ll>) {
        chomp $line;
        next if $line eq "";
        next if $line =~ /\.jar$|\.gz$/;

        # The listing is appended to once per list_non_present_files call, so
        # each common path normally appears twice; compare it only once.
        next if $already_compared{$line}++;

        # compare_files returns one of the two full paths on a mismatch. The
        # relative path is already at hand in $line, so record that directly
        # instead of stripping the stems back off the returned path.
        if (compare_files($stem1.$line, $stem2.$line) ne "") {
            $changed_file_list{$line}++;
        }
    }

    close $ll or die "Cannot close file: ".$list.": $!";

    # Sorted so the report is stable from one run to the next.
    open(my $chflist, '>>', $changed_files) or die "Cannot open file ".$changed_files." for writing: $!";
    print {$chflist} "The following files have differences between git repo and deployment: \n";
    print {$chflist} "$_\n" for sort keys %changed_file_list;
    print {$chflist} "\n";
    close($chflist) or die "Cannot close file: ".$changed_files.": $!";

    return;
}

# Step 1: write the full file listing of the Git checkout and of the deployment
# directory into their scratch files.
find_files($git_locations, $git, @git_found_files);
find_files($script_locations, $scripts, @scripts_found_files);

# The listings are re-read by name below, so make sure everything buffered has
# reached disk by closing any handle that is still open.
for my $listing_handle ($git, $scripts) {
    if ($listing_handle->opened() == 1) {
        close $listing_handle or die "Cannot close file: $!";
    }
}

# Step 2: report the files that exist in only one of the two trees, once in
# each direction. Both calls also record the files common to both trees.
list_non_present_files($scripts_files, $git_files, $file_diffs, $stem1, $stem2);
list_non_present_files($git_files, $scripts_files, $file_diffs, $stem1, $stem2);

# Step 3: for the files present in BOTH trees, compare their contents and
# report the ones that differ.
read_list($file_list, $stem1, $stem2, $changed_files);


