GNU Wget --delete-missing patch

GNU Wget は HTTP, FTP サーバからネタを取ってくるとき, とくに再帰的に取ってくるときなどに非常に便利で, かつ他の同種のものと比べて比較的お行儀もよいので愛用しているが,

Wget で Web ミラーしてきて Namazu に食わせて検索エンジン作ろう

とかいうときに困るのが, 先方のサーバから消えたファイルは, こちらのミラーでは消えないこと. だからって毎回 rm -r して全部取り直していてはお馬鹿さんである.

下のパッチで, --delete-missing というオプションが増える. これを付けると, HTTP サーバが 404 (Not Found) を返した URL に対応する手元のファイルが削除される.
wget-1.5.3 が対象である.

wget --delete-missing --recursive --timestamping \
	--force-directories --no-parent --tries=1 \
	--accept=html,htm,txt --non-verbose \
	http://www.foo.bar.com/

とかやるとよろしかろう.

diff -rcbw wget-1.5.3/src/http.c wget-1.5.3+/src/http.c
*** wget-1.5.3/src/http.c       Thu Oct  1 14:58:07 1998
--- wget-1.5.3+/src/http.c      Thu Oct  1 17:38:48 1998
***************
*** 663,668 ****
--- 663,678 ----
  
    logputs (LOG_VERBOSE, "\n");
  
+   if ((statcode == HTTP_STATUS_NOT_FOUND)
+       && opt.delete_missing
+       && file_exists_p(u->local))
+     {
+       logprintf (LOG_VERBOSE,
+                "Removing %s (URL:%s)\n", u->local, u->url);
+       if(unlink(u->local))
+       logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+     }
+ 
    if ((statcode == HTTP_STATUS_UNAUTHORIZED)
        && authenticate_h)
      {
diff -rcbw wget-1.5.3/src/init.c wget-1.5.3+/src/init.c
*** wget-1.5.3/src/init.c       Thu Oct  1 14:58:07 1998
--- wget-1.5.3+/src/init.c      Thu Oct  1 17:18:53 1998
***************
*** 94,99 ****
--- 94,100 ----
    { "debug",          &opt.debug,             cmd_boolean },
  #endif
    { "deleteafter",    &opt.delete_after,      cmd_boolean },
+   { "deletemissing",  &opt.delete_missing,    cmd_boolean },
    { "dirprefix",      &opt.dir_prefix,        cmd_string },
    { "dirstruct",      NULL,                   cmd_spec_dirstruct },
    { "domains",                &opt.domains,           cmd_vector },
diff -rcbw wget-1.5.3/src/main.c wget-1.5.3+/src/main.c
*** wget-1.5.3/src/main.c       Thu Oct  1 14:58:07 1998
--- wget-1.5.3+/src/main.c      Thu Oct  1 17:34:28 1998
***************
*** 172,177 ****
--- 172,178 ----
    -r,  --recursive             recursive web-suck -- use with care!.\n\
    -l,  --level=NUMBER          maximum recursion depth (0 to unlimit).\n\
         --delete-after          delete downloaded files.\n\
+        --delete-missing        delete missing files.\n\
    -k,  --convert-links         convert non-relative links to relative.\n\
    -m,  --mirror                turn on options suitable for mirroring.\n\
    -nr, --dont-remove-listing   don\'t remove `.listing\' files.\n\
***************
*** 238,243 ****
--- 239,245 ----
      { "cache", required_argument, NULL, 'C' },
      { "cut-dirs", required_argument, NULL, 17 },
      { "delete-after", no_argument, NULL, 8 },
+     { "delete-missing", no_argument, NULL, 24 },
      { "directory-prefix", required_argument, NULL, 'P' },
      { "domains", required_argument, NULL, 'D' },
      { "dot-style", required_argument, NULL, 6 },
***************
*** 300,305 ****
--- 302,310 ----
          break;
        case 8:
          setval ("deleteafter", "on");
+         break;
+       case 24:  /* better to enum.. */
+         setval ("deletemissing", "on");
          break;
        case 9:
          setval ("retrsymlinks", "on");
diff -rcbw wget-1.5.3/src/options.h wget-1.5.3+/src/options.h
*** wget-1.5.3/src/options.h    Wed Apr 29 06:29:40 1998
--- wget-1.5.3+/src/options.h   Thu Oct  1 17:08:58 1998
***************
*** 132,137 ****
--- 132,139 ----
  
    int delete_after;           /* Whether the files will be deleted
                                   after download. */
+   int delete_missing;         /* Whether the files will be deleted
+                                  if they're missing on the server. */
  };
  
  #ifndef OPTIONS_DEFINED_HERE
diff -rcbw wget-1.5.3/src/version.c wget-1.5.3+/src/version.c
*** wget-1.5.3/src/version.c    Thu Oct  1 14:58:07 1998
--- wget-1.5.3+/src/version.c   Thu Oct  1 17:40:01 1998
***************
*** 1 ****
! char *version_string = "1.5.3";
--- 1 ----
! char *version_string = "1.5.3+";


----------------------------------------------------------------

作者には

これじゃあ FTP ミラーのとき手元が消えない(から没)

といわれたが, とりあえず面倒くさいので HTTP 専用のまま.

適宜もくじへ戻る/飛ぶ
$Id: wget.html,v 1.3 2006-03-25 14:07:15 morimoto Exp $