#!/usr/bin/env python
'''
du.py [visiblesize] [directory]* [--SM] [--SP=filepath]*

The script "du.py" determines the sizes of individual files and the
accumulated storage attributed to files below each sub-directory.
It then sorts these lists and formats them to show the greater users
of the disk first while displaying the hierarchy.

A numeric argument can be entered to specify the smallest accumulation
(in bytes) to be displayed (although everything is accounted for).
The program can be given a list of hierarchy roots, or it will default
to using just the current working directory.

KNOWN BUG: It gets confused when the name of the root of a hierarchy
can be validly interpreted as an integer.  It uses that integer value
to limit the display instead.  You can use the idiom
"./number_like_name" to avoid this problem.

One option specifies a "stop path", a point at which the recursive
scan is blocked.  This permits one to ignore entire subdirectories,
typically those mounted on another partition (in the Unix/Linux
world).  Its syntax is "--SP=FILEPATH".  Stop paths may be given
relative to the current directory; they are compared after being made
absolute.  The "--SM" option says to stop recursion at all mount
points.

Typically one "cd"s to the root of the hierarchy of interest, then
invokes this command with a single numerical argument of, for example,
100000, and redirects the output to a file for later examination.

In other words, non-numeric arguments are treated as directory paths,
the last numeric argument is used as the limit.  Default values are
"." and "2048" respectively.

While processing a Fedora distribution mirror, I realized that an
earlier version of this program counted files with multiple hard links
multiple times.  The "already_counted" dictionary now keeps track of
what files have been counted and (unfairly?) assigns the size of files
with multiple hard links to the first instance.  The dictionary key is
the (device, inode) pair, so the accounting stays correct when the
hierarchy crosses a mount point.

Symbolic links are never followed: a link is charged a nominal size
(the length of its name) and the thing it points at is only counted
where it really lives.

This utility is inspired by an earlier program I wrote, dusort.py,
which was in turn inspired by a shell script dusort.  It is
implemented entirely in Python so that it can work on both Windows and
Linux systems.  (Conor Rafferty at Stanford University in 1987, Chip
Rosenthal at Unicom Systems Development in 1990, and others were
involved in the earlier versions of dusort.)

Copyright 2010, Randolph Bentson, bentson@holmsjoen.com

This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
USA
'''

import os
import sys
import stat

# Most operating systems allocate some minimum block size for even the
# tiniest file or remaining file fragment.  The "bs" factor is intended
# to capture the effect of this allocation.
bs = 512
bs_1 = bs - 1

# This is a list of paths beyond which the scan should
# not proceed.  In the Linux world these are typically mount
# points.  It's also possible to just stop at mount points.
stop_path = []
skip_mount = False

# Hierarchy roots collected from the command line.
args = []
# Smallest accumulation (in bytes) that is displayed.
limit = 2048
# Maps (device, inode) of a multiply-linked file to [size, times_seen].
already_counted = {}


def _allocated(nbytes):
    '''Round a byte count up to a whole number of "bs"-sized blocks.'''
    # Floor division keeps the result an integer on Python 3 as well.
    return bs * ((nbytes + bs_1) // bs)


def _is_stop_path(pathname):
    '''Tell whether "pathname" is one of the configured stop paths.

    Both sides are made absolute first so that relative "--SP=" values
    and values with a trailing separator still match.
    '''
    target = os.path.abspath(pathname)
    return any(os.path.abspath(stop) == target for stop in stop_path)


def getinfo(path):
    '''Scan the hierarchy rooted at "path" and summarise its disk usage.

    Returns a three-element list: [accumulated size, basename of path,
    subordinates].  "subordinates" is a list of the same kind of entry
    for each child whose accumulated size exceeds the module-level
    "limit", ordered from largest to smallest; plain files appear with
    None in the third slot.  If the directory cannot be listed, the
    error is printed and [0, basename, None] is returned.

    Files with several hard links are charged only to the first link
    encountered; each sighting is tallied in "already_counted".
    Symbolic links are not followed and are charged the length of
    their name.
    '''
    subordinates = []
    total = 0       # total accumulation for this path
    try:
        names = os.listdir(path)
    except OSError as err:
        # If a directory cannot be accessed, report its accumulated
        # size as zero.  (WindowsError is a subclass of OSError, so
        # this single clause covers both platforms.)
        print(err)
        return [0, os.path.basename(path), None]

    for name in names:
        pathname = os.path.join(path, name)
        try:
            # lstat, not stat: a symbolic link must be seen as a link.
            info = os.lstat(pathname)
        except OSError as err:
            # The entry vanished or is unreadable; skip it.
            print(err)
            continue
        mode = info.st_mode

        if stat.S_ISLNK(mode):
            # Checked first so links to files are not charged the
            # target's size and links to directories are not entered.
            total += len(name)      # + some fudge factor
        elif stat.S_ISREG(mode):
            size = _allocated(info.st_size)
            if info.st_nlink > 1:
                # Inode numbers are only unique per device.
                key = (info.st_dev, info.st_ino)
                if key in already_counted:
                    # Already charged to the first link we met.
                    already_counted[key][1] += 1
                    continue
                already_counted[key] = [size, 1]
            total += size
            if size > limit:
                subordinates.append([size, name, None])
        elif stat.S_ISDIR(mode):
            if _is_stop_path(pathname):
                continue
            if skip_mount and os.path.ismount(pathname):
                continue
            child = getinfo(pathname)
            total += _allocated(info.st_size)   # directory itself,
            total += child[0]                   # plus size of children
            if child[0] > limit:
                subordinates.append(child)

    # Largest first.  Sorting on (size, name) only avoids comparing the
    # third slot, which mixes None and lists.
    subordinates.sort(key=lambda entry: (entry[0], entry[1]), reverse=True)
    return [total, os.path.basename(path), subordinates]


def display(arg_info, pad):
    '''Print a (recursive) list of [accumulated size, pathname, kids].

    Each entry is printed as the size right-aligned in ten columns,
    followed by "pad" and the pathname; the kids of an entry are
    printed beneath it with one extra space of padding.
    '''
    for accumulation, path, kids in arg_info:
        print("%10d%s%s" % (accumulation, pad, path))
        if kids:
            display(kids, pad + ' ')


def main(argv=None):
    '''Parse the command line, then scan and report each hierarchy root.

    "argv" is the list of arguments without the program name; it
    defaults to sys.argv[1:].  Integer arguments set the display limit
    (the last one wins), "--SP=PATH" adds a stop path, "--SM" stops at
    mount points, and anything else is taken as a root to scan.
    '''
    global limit, skip_mount

    if argv is None:
        argv = sys.argv[1:]

    for arg in argv:
        try:
            limit = int(arg)
            continue
        except ValueError:
            pass
        if len(arg) > 5 and arg[:5] == "--SP=":
            stop_path.append(arg[5:])
        elif arg == "--SM":
            skip_mount = True
        else:
            args.append(arg)

    if not args:
        args.append(".")

    for arg in args:
        root = os.path.abspath(arg)
        arg_info = [getinfo(root)]
        print(os.path.dirname(root))
        display(arg_info, ' ')

        # Summarise what hard links saved within this hierarchy.
        counted_once = 0
        expanded = 0
        for size, count in already_counted.values():
            if count > 1:
                counted_once += size
                expanded += size * count
        if counted_once > 0:
            print("duplicates: %11d" % counted_once)
            print("expanded to: %11d" % expanded)
            print("saving: %11d" % (expanded - counted_once))
        # Start the next root with a clean slate so its report does
        # not repeat this root's duplicates.
        already_counted.clear()


if __name__ == "__main__":
    main()