detect-renamed.diff [plain text]
This patch adds the --detect-renamed option which makes rsync notice files
that either (1) match in size & modify-time (plus the basename, if possible)
or (2) match in size & checksum (when --checksum was also specified) and use
each match as an alternate basis file to speed up the transfer.
The algorithm attempts to scan the receiving-side's files in an efficient
manner. If --delete[-before] is enabled, we'll take advantage of the
pre-transfer delete pass to prepare any alternate-basis-file matches we
might find. If --delete-before is not enabled, rsync does the rename scan
during the regular file-sending scan (scanning each directory right before
the generator starts updating files from that dir). In this latter mode,
rsync might delay the updating of a file (if no alternate-basis match was
yet found) until the full scan of the receiving side is complete, at which
point any delayed files are processed.
I chose to hard-link the alternate-basis files into a ".~tmp~" subdir that
takes advantage of rsync's pre-existing partial-dir logic. This uses less
memory than trying to keep track of the matches internally, and also allows
any deletions or file-updates to occur normally without interfering with
these alternate-basis discoveries.
To use this patch, run these commands for a successful build:
patch -p1 <patches/detect-renamed.diff
./configure (optional if already run)
make
TODO:
We need to never return a match from fattr_find() that has a basis
file. This will ensure that we don't try to give a renamed file to
a file that can't use it, while missing out on giving it to a file
that could use it.
--- old/flist.c
+++ new/flist.c
@@ -51,6 +51,7 @@ extern int implied_dirs;
extern int prune_empty_dirs;
extern int copy_links;
extern int copy_unsafe_links;
+extern int detect_renamed;
extern int protocol_version;
extern int sanitize_paths;
extern struct stats stats;
@@ -68,6 +69,8 @@ int checksum_len;
dev_t filesystem_dev; /* used to implement -x */
unsigned int file_struct_len;
+struct file_list the_fattr_list;
+
static char empty_sum[MD4_SUM_LENGTH];
static int flist_count_offset;
@@ -250,6 +253,44 @@ static mode_t from_wire_mode(int mode)
return mode;
}
+static int fattr_compare(struct file_struct **file1, struct file_struct **file2)
+{
+ struct file_struct *f1 = *file1;
+ struct file_struct *f2 = *file2;
+ int diff;
+
+ if (!f1->basename || !S_ISREG(f1->mode) || !f1->length) {
+ if (!f2->basename || !S_ISREG(f2->mode) || !f2->length)
+ return 0;
+ return 1;
+ }
+ if (!f2->basename || !S_ISREG(f2->mode) || !f2->length)
+ return -1;
+
+ /* Don't use diff for values that are longer than an int. */
+ if (f1->length != f2->length)
+ return f1->length < f2->length ? -1 : 1;
+
+ if (always_checksum) {
+ diff = u_memcmp(f1->u.sum, f2->u.sum, checksum_len);
+ if (diff)
+ return diff;
+ } else if (f1->modtime != f2->modtime)
+ return f1->modtime < f2->modtime ? -1 : 1;
+
+ diff = u_strcmp(f1->basename, f2->basename);
+ if (diff)
+ return diff;
+
+ if (f1->dirname == f2->dirname)
+ return 0;
+ if (!f1->dirname)
+ return -1;
+ if (!f2->dirname)
+ return 1;
+ return u_strcmp(f1->dirname, f2->dirname);
+}
+
static void send_directory(int f, struct file_list *flist,
char *fbuf, int len);
@@ -1379,6 +1420,25 @@ struct file_list *recv_file_list(int f)
clean_flist(flist, relative_paths, 1);
+ if (detect_renamed) {
+ int j = flist->count;
+ the_fattr_list.count = j;
+ the_fattr_list.files = new_array(struct file_struct *, j);
+ if (!the_fattr_list.files)
+ goto oom;
+ memcpy(the_fattr_list.files, flist->files,
+ j * sizeof (struct file_struct *));
+ qsort(the_fattr_list.files, j,
+ sizeof the_fattr_list.files[0], (int (*)())fattr_compare);
+ the_fattr_list.low = 0;
+ while (j-- > 0) {
+ struct file_struct *fp = the_fattr_list.files[j];
+ if (fp->basename && S_ISREG(fp->mode) && fp->length)
+ break;
+ }
+ the_fattr_list.high = j;
+ }
+
if (f >= 0) {
recv_uid_list(f, flist);
--- old/generator.c
+++ new/generator.c
@@ -76,6 +76,7 @@ extern char *basis_dir[];
extern int compare_dest;
extern int copy_dest;
extern int link_dest;
+extern int detect_renamed;
extern int whole_file;
extern int list_only;
extern int new_root_dir;
@@ -91,12 +92,15 @@ extern char *backup_dir;
extern char *backup_suffix;
extern int backup_suffix_len;
extern struct file_list *the_file_list;
+extern struct file_list the_fattr_list;
extern struct filter_list_struct server_filter_list;
static int deletion_count = 0; /* used to implement --max-delete */
+static int unexplored_dirs = 1;
-/* For calling delete_file() */
+/* For calling delete_item() and delete_in_dir() */
#define DEL_FORCE_RECURSE (1<<1) /* recurse even w/o --force */
+#define DEL_NO_DELETIONS (1<<2)
#define DEL_TERSE (1<<3)
@@ -106,12 +110,120 @@ static int is_backup_file(char *fn)
return k > 0 && strcmp(fn+k, backup_suffix) == 0;
}
+/* Search for a regular file that matches either (1) the size & modified
+ * time (plus the basename, if possible) or (2) the size & checksum. If
+ * we find an exact match down to the dirname, return -1 because we found
+ * an up-to-date file in the transfer, not a renamed file. */
+static int fattr_find(struct file_struct *f, char *fname, alloc_pool_t pool)
+{
+ int low = the_fattr_list.low, high = the_fattr_list.high;
+ int mid, ok_match = -1, good_match = -1;
+ struct file_struct *fmid;
+ int diff;
+
+ while (low <= high) {
+ mid = (low + high) / 2;
+ fmid = the_fattr_list.files[mid];
+ if (fmid->length != f->length) {
+ if (fmid->length < f->length)
+ low = mid + 1;
+ else
+ high = mid - 1;
+ continue;
+ }
+ if (always_checksum) {
+ if (!f->u.sum) {
+ if (fmid->modtime == f->modtime
+ && f_name_cmp(fmid, f) == 0)
+ return -1; /* assume we can't help */
+ f->u.sum = pool_alloc(pool, MD4_SUM_LENGTH,
+ "fattr_find");
+ file_checksum(fname, f->u.sum, f->length);
+ }
+ diff = u_memcmp(fmid->u.sum, f->u.sum, checksum_len);
+ if (diff) {
+ if (diff < 0)
+ low = mid + 1;
+ else
+ high = mid - 1;
+ continue;
+ }
+ } else {
+ if (fmid->modtime != f->modtime) {
+ if (fmid->modtime < f->modtime)
+ low = mid + 1;
+ else
+ high = mid - 1;
+ continue;
+ }
+ }
+ ok_match = mid;
+ diff = u_strcmp(fmid->basename, f->basename);
+ if (diff == 0) {
+ good_match = mid;
+ if (fmid->dirname == f->dirname)
+ return -1; /* file is up-to-date */
+ if (!fmid->dirname) {
+ low = mid + 1;
+ continue;
+ }
+ if (!f->dirname) {
+ high = mid - 1;
+ continue;
+ }
+ diff = u_strcmp(fmid->dirname, f->dirname);
+ if (diff == 0)
+ return -1; /* file is up-to-date */
+ }
+ if (diff < 0)
+ low = mid + 1;
+ else
+ high = mid - 1;
+ }
+
+ return good_match >= 0 ? good_match : ok_match;
+}
+
+static void look_for_rename(struct file_struct *file, char *fname,
+ alloc_pool_t pool)
+{
+ struct file_struct *fp;
+ char *partialptr, *fn;
+ STRUCT_STAT st;
+ int ndx;
+
+ if ((ndx = fattr_find(file, fname, pool)) < 0)
+ return;
+
+ fp = the_fattr_list.files[ndx];
+ fn = f_name(fp, NULL);
+ /* We don't provide an alternate-basis file if there is a basis file. */
+ if (link_stat(fn, &st, 0) == 0)
+ return;
+ if ((partialptr = partial_dir_fname(fn)) == NULL
+ || !handle_partial_dir(partialptr, PDIR_CREATE))
+ return;
+
+ /* We only use the file if we can hard-link it into our tmp dir. */
+ if (link(fname, partialptr) == 0) {
+ if (verbose > 2) {
+ rprintf(FINFO, "found renamed: %s => %s\n",
+ fname, partialptr);
+ }
+ return;
+ }
+
+ if (errno != EEXIST)
+ handle_partial_dir(partialptr, PDIR_DELETE);
+}
/* Delete a file or directory. If DEL_FORCE_RECURSE is set in the flags, or if
* force_delete is set, this will delete recursively.
*
* Note that fname must point to a MAXPATHLEN buffer if the mode indicates it's
* a directory! (The buffer is used for recursion, but returned unchanged.)
+ *
+ * Also Note: --detect-rename may use this routine with DEL_NO_DELETIONS set!
*/
static int delete_item(char *fname, int mode, int flags)
{
@@ -122,6 +234,8 @@ static int delete_item(char *fname, int
char *p;
if (!S_ISDIR(mode)) {
+ if (flags & DEL_NO_DELETIONS)
+ return 0;
if (max_delete && ++deletion_count > max_delete)
return 0;
if (make_backups && (backup_dir || !is_backup_file(fname)))
@@ -144,6 +258,7 @@ static int delete_item(char *fname, int
zap_dir = flags & DEL_FORCE_RECURSE || force_delete;
if ((max_delete && ++deletion_count > max_delete)
+ || flags & DEL_NO_DELETIONS
|| (dry_run && zap_dir)) {
ok = 0;
errno = ENOTEMPTY;
@@ -186,6 +301,8 @@ static int delete_item(char *fname, int
continue;
strlcpy(p, fp->basename, remainder);
+ if (detect_renamed && S_ISREG(fp->mode))
+ look_for_rename(fp, fname, dirlist->file_pool);
delete_item(fname, fp->mode, flags & ~DEL_TERSE);
}
flist_free(dirlist);
@@ -194,7 +311,8 @@ static int delete_item(char *fname, int
pop_local_filters(save_filters);
- if (max_delete && ++deletion_count > max_delete)
+ if (flags & DEL_NO_DELETIONS
+ || (max_delete && ++deletion_count > max_delete))
return 0;
if (do_rmdir(fname) == 0) {
@@ -214,15 +332,19 @@ static int delete_item(char *fname, int
* all the --delete-WHEN options. Note that the fbuf pointer must point to a
* MAXPATHLEN buffer with the name of the directory in it (the functions we
* call will append names onto the end, but the old dir value will be restored
- * on exit). */
+ * on exit).
+ *
+ * Note: --detect-rename may use this routine with DEL_NO_DELETIONS set!
+ */
static void delete_in_dir(struct file_list *flist, char *fbuf,
- struct file_struct *file, STRUCT_STAT *stp)
+ struct file_struct *file, STRUCT_STAT *stp, int flags)
{
static int min_depth = MAXPATHLEN, cur_depth = -1;
static void *filt_array[MAXPATHLEN/2+1];
static int already_warned = 0;
struct file_list *dirlist;
- char delbuf[MAXPATHLEN];
+ char *p, delbuf[MAXPATHLEN];
+ unsigned remainder;
int dlen, i;
if (!flist) {
@@ -236,6 +358,8 @@ static void delete_in_dir(struct file_li
if (verbose > 2)
rprintf(FINFO, "delete_in_dir(%s)\n", fbuf);
+ flags |= DEL_FORCE_RECURSE;
+
if (allowed_lull)
maybe_send_keepalive();
@@ -243,12 +367,14 @@ static void delete_in_dir(struct file_li
return; /* Impossible... */
if (io_error && !(lp_ignore_errors(module_id) || ignore_errors)) {
- if (already_warned)
+ if (!already_warned) {
+ rprintf(FINFO,
+ "IO error encountered -- skipping file deletion\n");
+ already_warned = 1;
+ }
+ if (!detect_renamed)
return;
- rprintf(FINFO,
- "IO error encountered -- skipping file deletion\n");
- already_warned = 1;
- return;
+ flags |= DEL_NO_DELETIONS;
}
while (cur_depth >= file->dir.depth && cur_depth >= min_depth)
@@ -259,6 +385,9 @@ static void delete_in_dir(struct file_li
dlen = strlen(fbuf);
filt_array[cur_depth] = push_local_filters(fbuf, dlen);
+ if (detect_renamed)
+ unexplored_dirs--;
+
if (one_file_system) {
if (file->flags & FLAG_TOP_DIR)
filesystem_dev = stp->st_dev;
@@ -268,18 +397,30 @@ static void delete_in_dir(struct file_li
dirlist = get_dirlist(fbuf, dlen, 0);
+ p = fbuf + dlen;
+ if (dlen != 1 || *fbuf != '/')
+ *p++ = '/';
+ remainder = MAXPATHLEN - (p - fbuf);
+
/* If an item in dirlist is not found in flist, delete it
* from the filesystem. */
for (i = dirlist->count; i--; ) {
struct file_struct *fp = dirlist->files[i];
if (!fp->basename || fp->flags & FLAG_MOUNT_POINT)
continue;
+ if (detect_renamed && S_ISREG(fp->mode)) {
+ strlcpy(p, fp->basename, remainder);
+ look_for_rename(fp, fbuf, dirlist->file_pool);
+ }
if (flist_find(flist, fp) < 0) {
f_name(fp, delbuf);
- delete_item(delbuf, fp->mode, DEL_FORCE_RECURSE);
- }
+ delete_item(delbuf, fp->mode, flags);
+ } else if (detect_renamed && S_ISDIR(fp->mode))
+ unexplored_dirs++;
}
+ fbuf[dlen] = '\0';
+
flist_free(dirlist);
}
@@ -309,9 +450,9 @@ static void do_delete_pass(struct file_l
|| !S_ISDIR(st.st_mode))
continue;
- delete_in_dir(flist, fbuf, file, &st);
+ delete_in_dir(flist, fbuf, file, &st, 0);
}
- delete_in_dir(NULL, NULL, NULL, NULL);
+ delete_in_dir(NULL, NULL, NULL, NULL, 0);
if (do_progress && !am_server)
rprintf(FINFO, " \r");
@@ -771,6 +912,7 @@ static int try_dests_non(struct file_str
return -1;
}
+static struct bitbag *delayed_bits = NULL;
static int phase = 0;
/* Acts on the_file_list->file's ndx'th item, whose name is fname. If a dir,
@@ -944,8 +1086,12 @@ static void recv_generator(char *fname,
&& verbose && code != FNONE && f_out != -1)
rprintf(code, "%s/\n", fname);
if (delete_during && f_out != -1 && !phase && dry_run < 2
- && (file->flags & FLAG_DEL_HERE))
- delete_in_dir(the_file_list, fname, file, &st);
+ && (file->flags & FLAG_DEL_HERE)) {
+ if (detect_renamed && statret != 0)
+ unexplored_dirs++;
+ delete_in_dir(the_file_list, fname, file, &st,
+ delete_during < 0 ? DEL_NO_DELETIONS : 0);
+ }
return;
}
@@ -1201,8 +1347,14 @@ static void recv_generator(char *fname,
&& hard_link_check(file, ndx, fname, statret, &st,
itemizing, code, HL_SKIP))
return;
- if (stat_errno == ENOENT)
+ if (stat_errno == ENOENT) {
+ if (detect_renamed && unexplored_dirs > 0
+ && file->length) {
+ bitbag_set_bit(delayed_bits, ndx);
+ return;
+ }
goto notify_others;
+ }
rsyserr(FERROR, stat_errno, "recv_generator: failed to stat %s",
full_fname(fname));
return;
@@ -1388,11 +1540,17 @@ void generate_files(int f_out, struct fi
(long)getpid(), flist->count);
}
+ if (detect_renamed) {
+ delayed_bits = bitbag_create(flist->count);
+ if (!delete_before && !delete_during)
+ delete_during = -1;
+ }
+
if (delete_before && !local_name && flist->count > 0)
do_delete_pass(flist);
do_progress = 0;
- if (append_mode || whole_file < 0)
+ if (append_mode || detect_renamed || whole_file < 0)
whole_file = 0;
if (verbose >= 2) {
rprintf(FINFO, "delta-transmission %s\n",
@@ -1447,7 +1605,23 @@ void generate_files(int f_out, struct fi
}
recv_generator(NULL, NULL, 0, 0, 0, code, -1);
if (delete_during)
- delete_in_dir(NULL, NULL, NULL, NULL);
+ delete_in_dir(NULL, NULL, NULL, NULL, 0);
+
+ if (detect_renamed) {
+ if (delete_during < 0)
+ delete_during = 0;
+ detect_renamed = 0;
+
+ for (i = -1; (i = bitbag_next_bit(delayed_bits, i)) >= 0; ) {
+ struct file_struct *file = flist->files[i];
+ if (local_name)
+ strlcpy(fbuf, local_name, sizeof fbuf);
+ else
+ f_name(file, fbuf);
+ recv_generator(fbuf, file, i, itemizing,
+ maybe_ATTRS_REPORT, code, f_out);
+ }
+ }
phase++;
csum_length = SUM_LENGTH;
--- old/options.c
+++ new/options.c
@@ -78,6 +78,7 @@ int am_generator = 0;
int am_starting_up = 1;
int relative_paths = -1;
int implied_dirs = 1;
+int detect_renamed = 0;
int numeric_ids = 0;
int allow_8bit_chars = 0;
int force_delete = 0;
@@ -346,6 +347,7 @@ void usage(enum logcode F)
rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n");
rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n");
rprintf(F," -y, --fuzzy find similar file for basis if no dest file\n");
+ rprintf(F," --detect-renamed try to find renamed files to speed up the transfer\n");
rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
@@ -499,6 +501,7 @@ static struct poptOption long_options[]
{"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
{"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
{"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
+ {"detect-renamed", 0, POPT_ARG_NONE, &detect_renamed, 0, 0, 0 },
{"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
{"compress", 'z', POPT_ARG_NONE, 0, 'z', 0, 0 },
{"compress-level", 0, POPT_ARG_INT, &def_compress_level, 'z', 0, 0 },
@@ -1362,7 +1365,7 @@ int parse_arguments(int *argc, const cha
inplace = 1;
}
- if (delay_updates && !partial_dir)
+ if ((delay_updates || detect_renamed) && !partial_dir)
partial_dir = tmp_partialdir;
if (inplace) {
@@ -1371,6 +1374,7 @@ int parse_arguments(int *argc, const cha
snprintf(err_buf, sizeof err_buf,
"--%s cannot be used with --%s\n",
append_mode ? "append" : "inplace",
+ detect_renamed ? "detect-renamed" :
delay_updates ? "delay-updates" : "partial-dir");
return 0;
}
@@ -1674,6 +1678,8 @@ void server_options(char **args,int *arg
args[ac++] = "--super";
if (size_only)
args[ac++] = "--size-only";
+ if (detect_renamed)
+ args[ac++] = "--detect-renamed";
}
if (modify_window_set) {
--- old/rsync.yo
+++ new/rsync.yo
@@ -363,6 +363,7 @@ to the detailed description below for a
--modify-window=NUM compare mod-times with reduced accuracy
-T, --temp-dir=DIR create temporary files in directory DIR
-y, --fuzzy find similar file for basis if no dest file
+ --detect-renamed try to find renamed files to speed the xfer
--compare-dest=DIR also compare received files relative to DIR
--copy-dest=DIR ... and include copies of unchanged files
--link-dest=DIR hardlink to files in DIR when unchanged
@@ -1259,6 +1260,15 @@ Note that the use of the bf(--delete) op
fuzzy-match files, so either use bf(--delete-after) or specify some
filename exclusions if you need to prevent this.
+dit(bf(--detect-renamed)) This option tells rsync to scan the receiving
+side for files that have been renamed, and to use any that are found as
+alternate basis files to help speed up the transfer.
+By default, alternate-basis files are hard-linked into a directory named
+".~tmp~" in each file's destination directory, but if you've specified
+the bf(--partial-dir) option, that directory will be used instead. These
+potential alternate-basis files will be removed as the transfer progresses.
+This option conflicts with bf(--inplace) and bf(--append).
+
dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on
the destination machine as an additional hierarchy to compare destination
files against doing transfers (if the files are missing in the destination
--- old/util.c
+++ new/util.c
@@ -1025,6 +1025,32 @@ int handle_partial_dir(const char *fname
return 1;
}
+/* We need to supply our own strcmp function for file list comparisons
+ * to ensure that signed/unsigned usage is consistent between machines. */
+int u_strcmp(const char *p1, const char *p2)
+{
+ for ( ; *p1; p1++, p2++) {
+ if (*p1 != *p2)
+ break;
+ }
+
+ return (int)*(uchar*)p1 - (int)*(uchar*)p2;
+}
+
+/* We need a memcmp function compares unsigned-byte values. */
+int u_memcmp(const void *p1, const void *p2, size_t len)
+{
+ const uchar *u1 = p1;
+ const uchar *u2 = p2;
+
+ while (len--) {
+ if (*u1 != *u2)
+ return (int)*u1 - (int)*u2;
+ }
+
+ return 0;
+}
+
/**
* Determine if a symlink points outside the current directory tree.
* This is considered "unsafe" because e.g. when mirroring somebody
--- old/proto.h
+++ new/proto.h
@@ -304,6 +304,8 @@ int pop_dir(char *dir);
char *full_fname(const char *fn);
char *partial_dir_fname(const char *fname);
int handle_partial_dir(const char *fname, int create);
+int u_strcmp(const char *p1, const char *p2);
+int u_memcmp(const void *p1, const void *p2, size_t len);
int unsafe_symlink(const char *dest, const char *src);
char *human_num(int64 num);
char *human_dnum(double dnum, int decimal_digits);
--- old/rsync.1
+++ new/rsync.1
@@ -429,6 +429,7 @@ to the detailed description below for a
\-\-modify\-window=NUM compare mod-times with reduced accuracy
\-T, \-\-temp\-dir=DIR create temporary files in directory DIR
\-y, \-\-fuzzy find similar file for basis if no dest file
+ \-\-detect\-renamed try to find renamed files to speed the xfer
\-\-compare\-dest=DIR also compare received files relative to DIR
\-\-copy\-dest=DIR \&.\&.\&. and include copies of unchanged files
\-\-link\-dest=DIR hardlink to files in DIR when unchanged
@@ -1458,6 +1459,16 @@ Note that the use of the \fB\-\-delete\f
fuzzy-match files, so either use \fB\-\-delete\-after\fP or specify some
filename exclusions if you need to prevent this\&.
.IP
+.IP "\fB\-\-detect\-renamed\fP"
+This option tells rsync to scan the receiving
+side for files that have been renamed, and to use any that are found as
+alternate basis files to help speed up the transfer\&.
+By default, alternate-basis files are hard-linked into a directory named
+"\&.~tmp~" in each file\&'s destination directory, but if you\&'ve specified
+the \fB\-\-partial\-dir\fP option, that directory will be used instead\&. These
+potential alternate-basis files will be removed as the transfer progresses\&.
+This option conflicts with \fB\-\-inplace\fP and \fB\-\-append\fP\&.
+.IP
.IP "\fB\-\-compare\-dest=DIR\fP"
This option instructs rsync to use \fIDIR\fP on
the destination machine as an additional hierarchy to compare destination