#include "diff.h"
#include <cmpbuf.h>
#include <regex.h>
#include <setmode.h>
#include <xalloc.h>
#define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
#define HASH(h, c) ((c) + ROL (h, 7))
typedef size_t hash_value;
verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
struct equivclass
{
lin next;
hash_value hash;
char const *line;
size_t length;
};
static lin *buckets;
static size_t nbuckets;
static struct equivclass *equivs;
static lin equivs_index;
static lin equivs_alloc;
void
file_block_read (struct file_data *current, size_t size)
{
if (size && ! current->eof)
{
size_t s = block_read (current->desc,
FILE_BUFFER (current) + current->buffered, size);
if (s == SIZE_MAX)
pfatal_with_name (current->name);
current->buffered += s;
current->eof = s < size;
}
}
#define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
static bool
sip (struct file_data *current, bool skip_test)
{
if (current->desc < 0)
{
current->bufsize = sizeof (word);
current->buffer = xmalloc (current->bufsize);
}
else
{
current->bufsize = buffer_lcm (sizeof (word),
STAT_BLOCKSIZE (current->stat),
PTRDIFF_MAX - 2 * sizeof (word));
current->buffer = xmalloc (current->bufsize);
if (! skip_test)
{
bool was_binary = set_binary_mode (current->desc, 1);
off_t buffered;
file_block_read (current, current->bufsize);
buffered = current->buffered;
if (! was_binary)
{
if (lseek (current->desc, - buffered, SEEK_CUR) == -1)
pfatal_with_name (current->name);
set_binary_mode (current->desc, 0);
current->buffered = 0;
current->eof = 0;
}
return binary_file_p (current->buffer, buffered);
}
}
current->buffered = 0;
current->eof = 0;
return 0;
}
static void
slurp (struct file_data *current)
{
size_t cc;
if (current->desc < 0)
{
return;
}
if (S_ISREG (current->stat.st_mode))
{
size_t file_size = current->stat.st_size;
cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
if (file_size != current->stat.st_size || cc < file_size
|| PTRDIFF_MAX <= cc)
xalloc_die ();
if (current->bufsize < cc)
{
current->bufsize = cc;
current->buffer = xrealloc (current->buffer, cc);
}
if (current->buffered <= file_size)
{
file_block_read (current, file_size + 1 - current->buffered);
if (current->buffered <= file_size)
return;
}
}
file_block_read (current, current->bufsize - current->buffered);
if (current->buffered)
{
while (current->buffered == current->bufsize)
{
if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
xalloc_die ();
current->bufsize *= 2;
current->buffer = xrealloc (current->buffer, current->bufsize);
file_block_read (current, current->bufsize - current->buffered);
}
cc = current->buffered + 2 * sizeof (word);
current->bufsize = cc - cc % sizeof (word);
current->buffer = xrealloc (current->buffer, current->bufsize);
}
}
static void
find_and_hash_each_line (struct file_data *current)
{
hash_value h;
unsigned char const *p = (unsigned char const *) current->prefix_end;
unsigned char c;
lin i, *bucket;
size_t length;
char const **linbuf = current->linbuf;
lin alloc_lines = current->alloc_lines;
lin line = 0;
lin linbuf_base = current->linbuf_base;
lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
struct equivclass *eqs = equivs;
lin eqs_index = equivs_index;
lin eqs_alloc = equivs_alloc;
char const *suffix_begin = current->suffix_begin;
char const *bufend = FILE_BUFFER (current) + current->buffered;
bool diff_length_compare_anyway =
ignore_white_space != IGNORE_NO_WHITE_SPACE;
bool same_length_diff_contents_compare_anyway =
diff_length_compare_anyway | ignore_case;
while ((char const *) p < suffix_begin)
{
char const *ip = (char const *) p;
h = 0;
if (ignore_case)
switch (ignore_white_space)
{
case IGNORE_ALL_SPACE:
while ((c = *p++) != '\n')
if (! ISSPACE (c))
h = HASH (h, TOLOWER (c));
break;
case IGNORE_SPACE_CHANGE:
while ((c = *p++) != '\n')
{
if (ISSPACE (c))
{
do
if ((c = *p++) == '\n')
goto hashing_done;
while (ISSPACE (c));
h = HASH (h, ' ');
}
h = HASH (h, TOLOWER (c));
}
break;
case IGNORE_TAB_EXPANSION:
{
size_t column = 0;
while ((c = *p++) != '\n')
{
int repetitions = 1;
switch (c)
{
case '\b':
column -= 0 < column;
break;
case '\t':
c = ' ';
repetitions = TAB_WIDTH - column % TAB_WIDTH;
column += repetitions;
break;
case '\r':
column = 0;
break;
default:
c = TOLOWER (c);
column++;
break;
}
do
h = HASH (h, c);
while (--repetitions != 0);
}
}
break;
default:
while ((c = *p++) != '\n')
h = HASH (h, TOLOWER (c));
break;
}
else
switch (ignore_white_space)
{
case IGNORE_ALL_SPACE:
while ((c = *p++) != '\n')
if (! ISSPACE (c))
h = HASH (h, c);
break;
case IGNORE_SPACE_CHANGE:
while ((c = *p++) != '\n')
{
if (ISSPACE (c))
{
do
if ((c = *p++) == '\n')
goto hashing_done;
while (ISSPACE (c));
h = HASH (h, ' ');
}
h = HASH (h, c);
}
break;
case IGNORE_TAB_EXPANSION:
{
size_t column = 0;
while ((c = *p++) != '\n')
{
int repetitions = 1;
switch (c)
{
case '\b':
column -= 0 < column;
break;
case '\t':
c = ' ';
repetitions = TAB_WIDTH - column % TAB_WIDTH;
column += repetitions;
break;
case '\r':
column = 0;
break;
default:
column++;
break;
}
do
h = HASH (h, c);
while (--repetitions != 0);
}
}
break;
default:
while ((c = *p++) != '\n')
h = HASH (h, c);
break;
}
hashing_done:;
bucket = &buckets[h % nbuckets];
length = (char const *) p - ip - 1;
if ((char const *) p == bufend
&& current->missing_newline
&& ROBUST_OUTPUT_STYLE (output_style))
{
if (ignore_white_space < IGNORE_SPACE_CHANGE)
bucket = &buckets[-1];
p--;
bufend = suffix_begin = (char const *) p;
}
for (i = *bucket; ; i = eqs[i].next)
if (!i)
{
i = eqs_index++;
if (i == eqs_alloc)
{
if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
xalloc_die ();
eqs_alloc *= 2;
eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
}
eqs[i].next = *bucket;
eqs[i].hash = h;
eqs[i].line = ip;
eqs[i].length = length;
*bucket = i;
break;
}
else if (eqs[i].hash == h)
{
char const *eqline = eqs[i].line;
if (eqs[i].length == length)
{
if (memcmp (eqline, ip, length) == 0)
break;
if (!same_length_diff_contents_compare_anyway)
continue;
}
else if (!diff_length_compare_anyway)
continue;
if (! lines_differ (eqline, ip))
break;
}
if (line == alloc_lines)
{
if (PTRDIFF_MAX / 3 <= alloc_lines
|| PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
|| PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
xalloc_die ();
alloc_lines = 2 * alloc_lines - linbuf_base;
cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
linbuf += linbuf_base;
linbuf = xrealloc (linbuf,
(alloc_lines - linbuf_base) * sizeof *linbuf);
linbuf -= linbuf_base;
}
linbuf[line] = ip;
cureqs[line] = i;
++line;
}
current->buffered_lines = line;
for (i = 0; ; i++)
{
if (line == alloc_lines)
{
if (PTRDIFF_MAX / 3 <= alloc_lines
|| PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
|| PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
xalloc_die ();
alloc_lines = 2 * alloc_lines - linbuf_base;
linbuf += linbuf_base;
linbuf = xrealloc (linbuf,
(alloc_lines - linbuf_base) * sizeof *linbuf);
linbuf -= linbuf_base;
}
linbuf[line] = (char const *) p;
if ((char const *) p == bufend)
break;
if (context <= i && no_diff_means_no_output)
break;
line++;
while (*p++ != '\n')
continue;
}
current->linbuf = linbuf;
current->valid_lines = line;
current->alloc_lines = alloc_lines;
current->equivs = cureqs;
equivs = eqs;
equivs_alloc = eqs_alloc;
equivs_index = eqs_index;
}
static void
prepare_text (struct file_data *current)
{
size_t buffered = current->buffered;
char *p = FILE_BUFFER (current);
char *dst;
if (buffered == 0 || p[buffered - 1] == '\n')
current->missing_newline = 0;
else
{
p[buffered++] = '\n';
current->missing_newline = 1;
}
if (!p)
return;
memset (p + buffered, 0, sizeof (word));
if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
{
char const *src = dst;
char const *srclim = p + buffered;
do
dst += ! ((*dst = *src++) == '\r' && *src == '\n');
while (src < srclim);
buffered -= src - dst;
}
current->buffered = buffered;
}
static lin
guess_lines (lin n, size_t s, size_t t)
{
size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
}
static void
find_identical_ends (struct file_data filevec[])
{
word *w0, *w1;
char *p0, *p1, *buffer0, *buffer1;
char const *end0, *beg0;
char const **linbuf0, **linbuf1;
lin i, lines;
size_t n0, n1;
lin alloc_lines0, alloc_lines1;
lin buffered_prefix, prefix_count, prefix_mask;
lin middle_guess, suffix_guess;
slurp (&filevec[0]);
prepare_text (&filevec[0]);
if (filevec[0].desc != filevec[1].desc)
{
slurp (&filevec[1]);
prepare_text (&filevec[1]);
}
else
{
filevec[1].buffer = filevec[0].buffer;
filevec[1].bufsize = filevec[0].bufsize;
filevec[1].buffered = filevec[0].buffered;
filevec[1].missing_newline = filevec[0].missing_newline;
}
w0 = filevec[0].buffer;
w1 = filevec[1].buffer;
p0 = buffer0 = (char *) w0;
p1 = buffer1 = (char *) w1;
n0 = filevec[0].buffered;
n1 = filevec[1].buffered;
if (p0 == p1)
p0 = p1 += n1;
else
{
if (n0 < n1)
p0[n0] = ~p1[n0];
else
p1[n1] = ~p0[n1];
while (*w0 == *w1)
w0++, w1++;
p0 = (char *) w0;
p1 = (char *) w1;
while (*p0 == *p1)
p0++, p1++;
if (ROBUST_OUTPUT_STYLE (output_style)
&& ((buffer0 + n0 - filevec[0].missing_newline < p0)
!=
(buffer1 + n1 - filevec[1].missing_newline < p1)))
p0--, p1--;
}
i = horizon_lines;
while (p0 != buffer0 && (p0[-1] != '\n' || i--))
p0--, p1--;
filevec[0].prefix_end = p0;
filevec[1].prefix_end = p1;
p0 = buffer0 + n0;
p1 = buffer1 + n1;
if (! ROBUST_OUTPUT_STYLE (output_style)
|| filevec[0].missing_newline == filevec[1].missing_newline)
{
end0 = p0;
beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
for (; p0 != beg0; p0--, p1--)
if (*p0 != *p1)
{
beg0 = p0;
break;
}
i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
&&
(buffer1 == p1 || p1[-1] == '\n'));
while (i-- && p0 != end0)
while (*p0++ != '\n')
continue;
p1 += p0 - beg0;
}
filevec[0].suffix_begin = p0;
filevec[1].suffix_begin = p1;
if (no_diff_means_no_output && ! function_regexp.fastmap
&& context < LIN_MAX / 4 && context < n0)
{
middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
for (prefix_count = 1; prefix_count <= context; prefix_count *= 2)
continue;
alloc_lines0 = (prefix_count + middle_guess
+ MIN (context, suffix_guess));
}
else
{
prefix_count = 0;
alloc_lines0 = guess_lines (0, 0, n0);
}
prefix_mask = prefix_count - 1;
lines = 0;
linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
p0 = buffer0;
if (! (no_diff_means_no_output
&& filevec[0].prefix_end == p0
&& filevec[1].prefix_end == p1))
{
end0 = filevec[0].prefix_end;
while (p0 != end0)
{
lin l = lines++ & prefix_mask;
if (l == alloc_lines0)
{
if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
xalloc_die ();
alloc_lines0 *= 2;
linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
}
linbuf0[l] = p0;
while (*p0++ != '\n')
continue;
}
}
buffered_prefix = prefix_count && context < lines ? context : lines;
middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
if (alloc_lines1 < buffered_prefix
|| PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
xalloc_die ();
linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
if (buffered_prefix != lines)
{
for (i = 0; i < buffered_prefix; i++)
linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
for (i = 0; i < buffered_prefix; i++)
linbuf0[i] = linbuf1[i];
}
for (i = 0; i < buffered_prefix; i++)
linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
filevec[0].linbuf = linbuf0 + buffered_prefix;
filevec[1].linbuf = linbuf1 + buffered_prefix;
filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
}
static unsigned char const prime_offset[] =
{
0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
55, 93, 1, 57, 25
};
verify (enough_prime_offsets,
sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
bool
read_files (struct file_data filevec[], bool pretend_binary)
{
int i;
bool skip_test = text | pretend_binary;
bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
if (filevec[0].desc != filevec[1].desc)
appears_binary |= sip (&filevec[1], skip_test | appears_binary);
else
{
filevec[1].buffer = filevec[0].buffer;
filevec[1].bufsize = filevec[0].bufsize;
filevec[1].buffered = filevec[0].buffered;
}
if (appears_binary)
{
set_binary_mode (filevec[0].desc, 1);
set_binary_mode (filevec[1].desc, 1);
return 1;
}
find_identical_ends (filevec);
equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
xalloc_die ();
equivs = xmalloc (equivs_alloc * sizeof *equivs);
equivs_index = 1;
for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
continue;
nbuckets = ((size_t) 1 << i) - prime_offset[i];
if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
xalloc_die ();
buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
buckets++;
for (i = 0; i < 2; i++)
find_and_hash_each_line (&filevec[i]);
filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
free (equivs);
free (buckets - 1);
return 0;
}