From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.2.5 (2008-06-10) on sa.int.altlinux.org X-Spam-Level: X-Spam-Status: No, score=-2.1 required=5.0 tests=AWL,BAYES_00,SPF_PASS autolearn=ham version=3.2.5 Date: Fri, 20 Feb 2009 23:19:13 +0200 From: Igor Vlasenko To: devel@lists.altlinux.org Message-ID: <20090220211913.GA32271@dad.imath.kiev.ua> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="17pEHd4RhPHOinZp" Content-Disposition: inline Content-Transfer-Encoding: 8bit User-Agent: Mutt/1.5.18 (2008-05-17) Received-SPF: pass (dad.imath.kiev.ua: domain of vlasenko@dad.imath.kiev.ua designates 127.0.0.1 as permitted sender) receiver=dad.imath.kiev.ua; client-ip=127.0.0.1; helo=dad.imath.kiev.ua; envelope-from=vlasenko@dad.imath.kiev.ua; x-software=spfmilter 0.95 http://www.acme.com/software/spfmilter/ with libspf2; Subject: [devel] rpmlndup X-BeenThere: devel@lists.altlinux.org X-Mailman-Version: 2.1.10b3 Precedence: list Reply-To: ALT Linux Team development discussions List-Id: ALT Linux Team development discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 20 Feb 2009 21:19:18 -0000 Archived-At: List-Archive: List-Post: --17pEHd4RhPHOinZp Content-Type: text/plain; charset=utf-8 Content-Disposition: inline Content-Transfer-Encoding: 8bit Раз уже пошла тема о скриптах, поделюсь скриптом rpmlndup. =head1 NAME rpmlndup - a tool that reduces rpm repositories size by hardlinking identical rpms. identical = name, real size and sha1/md5 sig are the same. когда я его у себя запустил, то винт похудел на 70Гб, и несмотря на то, что я пользуюсь --link-dest. --link-dest ко всем dest не напишешь. Если будет интерес, напишу help и выложу в Сизиф. -- Dr. 
Igor Vlasenko -------------------- Topology Department Institute of Math Kiev, Ukraine --17pEHd4RhPHOinZp Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename=rpmlndup

#!/usr/bin/perl -w
#
# rpmlndup - hardlink identical rpm files found under the given directories.
#
# Outline:
#   1. walk the directories and group *.rpm files by file name;
#   2. for every name seen in more than one distinct file, read the rpm
#      header and regroup by name + checksum + size + filesystem;
#   3. report the possible saving, ask for confirmation, then replace every
#      duplicate with a hard link to the first file of its group.
#
# NOTE(review): the POD at the end of this script documents -h, -y, -n and
# -a, but only --quiet, --verbose and --skip-no-sum are defined below.

use strict;
use warnings;
use File::Find;
use RPM::Header;
use Getopt::Long;

my $verbose   = 1;
my $skipnosum = 0;

# GetOptions has already printed a diagnostic when it returns false.
GetOptions(
    'quiet'       => sub { $verbose = 0 },
    'skip-no-sum' => \$skipnosum,
    'verbose+'    => \$verbose,
) or die "usage: $0 [--verbose] [--quiet] [--skip-no-sum] directory...\n";

my @directories = @ARGV;
for my $dir (@directories) {
    -d $dir or die "argument is not a directory: $dir\n";
}

# first step is just a usual find; to find dup names
my %rpmbyname;
find(\&wanted, @directories);

# File::Find callback: records every regular, non-symlink *.rpm file in
# %rpmbyname, keyed by its base name.
#   $_                = foo.ext (File::Find has chdir'ed into the directory)
#   $File::Find::name = /some/path/foo.ext
sub wanted {
    my $name = $_;
    return unless $name =~ /\.rpm$/;
    return if -l $name;

    # stat fields used: 0 = device, 1 = inode, 7 = size in bytes
    my @stat = stat $name;
    unless (@stat) {
        warn "$File::Find::name skipped: stat failed: $!\n" if $verbose;
        return;
    }
    return unless -f _;    # reuses the stat above; skip non-regular files

    push @{ $rpmbyname{$name} }, {
        NAME  => $name,
        PATH  => $File::Find::name,
        DEV   => $stat[0],
        INODE => $stat[1],
        SIZE  => $stat[7],
    };
    return;
}

# Returns the number of distinct files (device + inode pairs) among the
# records in the given array reference.  Paths that are already hard links
# to each other count once.
sub distinct_files {
    my ($records) = @_;
    my %seen;
    $seen{"$_->{DEV}:$_->{INODE}"} = 1 for @$records;
    return scalar keys %seen;
}

# second step is to find genuine dups; the same size and sha1/md5sum.
my %rpmbysum;
for my $records (values %rpmbyname) {
    next if distinct_files($records) < 2;
    bysum($_) for @$records;
}
undef %rpmbyname;

my $dupcount = 0;
my $economy  = 0;
my @rpmtolink;
for my $records (values %rpmbysum) {
    my $dupnum = distinct_files($records) - 1;
    next if $dupnum < 1;
    $economy  += $records->[0]{SIZE} * $dupnum;
    $dupcount += $dupnum;
    push @rpmtolink, $records;
}
undef %rpmbysum;

print STDERR "hardlinking duplicate rpms will give print total economy: $economy bytes in $dupcount rpms.\n";
print STDERR "Do you want to continue (y/n)?.\n";
my $answer = <STDIN>;

# End of input counts as "no".
exit 0 unless defined $answer and $answer =~ /^\s*y/i;
print "continue with " . scalar(@rpmtolink) . " dups\n";

# third step: replace every duplicate with a hard link to the group's
# first file.  The duplicate is moved aside to "<path>.bak" first so it can
# be put back if link() fails.
for my $group (@rpmtolink) {
    die "internal error! not enough files!" if @$group < 2;

    my ($master, @others) = @$group;
    my $masterinode = $master->{INODE};
    my $masterpath  = $master->{PATH};

    for my $slave (@others) {
        # All members of a group are on the same filesystem (see bysum),
        # so equal inode numbers mean "already linked".
        next if $slave->{INODE} == $masterinode;

        my $slavepath = $slave->{PATH};
        my $backup    = $slavepath . '.bak';
        die "impossible :(" if $slavepath eq $masterpath;

        # rename() would silently replace an existing backup file.
        if (-l $backup or -e $backup) {
            warn "$slavepath skipped: $backup already exists\n";
            next;
        }

        # "or", not "||": "||" would bind to the string operand and the
        # die could never fire.
        rename $slavepath, $backup
            or die "rename $slavepath, $slavepath.bak failed: $!";

        unless (link $masterpath, $slavepath) {
            warn "link $masterpath, $slavepath failed: $!";
            rename $backup, $slavepath
                or warn "could not restore $slavepath from $backup: $!";
            die "execution aborted.";
        }

        # Copy the access/modification times of the replaced file onto the
        # new link.
        system('touch', '-acm', '-r', $backup, '--', $slavepath) == 0
            or warn "touch -r $backup $slavepath failed\n";

        unlink $backup
            or die "cleanup of $slavepath failed: $!";
        print "linked successfully: $masterpath -> $slavepath\n" if $verbose;
    }
}

# Reads the rpm header of one file record (hash reference built by wanted)
# and files the record in %rpmbysum under a key made of name, checksum,
# size and device.  The checksum is SHA1HEADER, else SIGMD5; when neither is
# present the record is dropped if --skip-no-sum was given, otherwise
# SIGSIZE (or the stat size) stands in for the checksum.
# Records whose header cannot be read are skipped.
sub bysum {
    my ($rpm) = @_;
    my $size = $rpm->{SIZE};

    my $header = eval { RPM::Header->new($rpm->{PATH}) };
    my $error  = $@;
    if ($error) {
        warn "$rpm->{PATH} skipped: $error\n" if $verbose;
        return;
    }
    # NOTE(review): presumably the constructor may return undef without
    # dying; without this check the lookups below would autovivify $header
    # and the file would be matched by name and size only.
    unless (defined $header) {
        warn "$rpm->{PATH} skipped: rpm header could not be read\n" if $verbose;
        return;
    }

    my $sum = $header->{SHA1HEADER}->[0];
    unless ($sum) {
        warn "no sha1sum for $rpm->{NAME} - trying MD5\n" if $verbose;
        $sum = $header->{SIGMD5}->[0];
    }
    unless ($sum) {
        warn "no md5sum for $rpm->{NAME}\n" if $verbose;
        return if $skipnosum;
        # let at least the declared size be the same
        $sum = $header->{SIGSIZE}->[0] || $size;
    }
    $rpm->{SUM} = $sum;

    # The device is part of the key: hard links cannot cross filesystems
    # and inode numbers are only unique within one.
    my $key = $rpm->{NAME} . '!' . $sum . '|' . $size . '@' . $rpm->{DEV};
    push @{ $rpmbysum{$key} }, $rpm;
    return;
}

=head1 NAME

rpmlndup - a tool that reduces rpm repositories size by hardlinking identical rpms.

=head1 SYNOPSIS

B<rpmlndup>
[B<-h|--help>]
[B<-v|--verbose>]
[B<-q|--quiet>]
[B<-y|--yes|--batch>]
[B<-a|--ask|--interactive>]
[B<-n|--no|--count>]
[B<-s|--skip-no-sum>]
[I<directory>...]

=head1 DESCRIPTION

B<rpmlndup> reduces rpm repositories size by hardlinking identical rpms.
Identical means that the name, the real size and the sha1/md5 sig are the same.

=head1 OPTIONS

=over

=item B<-h, --help>

Display this help and exit.

=item B<-v, --verbose>, B<-q, --quiet>

Verbosity level. Multiple -v increase the verbosity level, -q sets it to 0.

=item B<-y|--yes>, B<--batch>

Batch mode. links identical rpm after counting.

=item B<-n|--no>, B<--count>

No linking identical rpm, just counting space to be freed.

=item B<-a|--ask>, B<--interactive>

Interactive mode (default). Counts free space and asks to proceed with linking.

=item B<-s|--skip-no-sum>

Skip unsigned rpms (that have no sha1 or md5 sum).

=back

=head1 AUTHOR

Written by Igor Vlasenko .

=head1 COPYING

Copyright (c) 2009 Igor Vlasenko, ALT Linux Team.

This is free software; you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation;
either version 2 of the License, or (at your option) any later version.

=cut

--17pEHd4RhPHOinZp--