[OSM-dev] Too many slow queries in db

Tue Sep 4 05:57:31 BST 2007

On Tue, 2007-09-04 at 00:28 +0100, Jon Burgess wrote:
> I've also got some evidence that the 758M object size quoted on the
> tiger stats page is wrong, with the true figure being only half that
> size. I'm still downloading more of the data to confirm this.

Well, for goodness sake, please keep them to yourself!

Seriously, though.  I wrote this pile in about 20 seconds and never
thought about it again.  The shell script below generates the "statfiles",
and the perl script after it generates the web page.  Go to town :)

#!/bin/sh
# Build a "<name>.osm.stats" file next to every county extract found under
# counties/, holding the number of lines that start with <node, <segment and
# <way.  Extracts that already have a stats file are skipped, so the script
# can simply be re-run after an interruption.
find counties/ \( -name '*.osm.gz' -o -name '*.osm' \) | while IFS= read -r i; do
        # The stats file is named after the uncompressed extract.  Use POSIX
        # suffix removal: the ${var/pat/} form is a bashism that /bin/sh need
        # not support, and it would also hit a ".gz" in the middle of a path.
        stats="${i%.gz}.stats"
        [ -e "$stats" ] && continue
        echo "$i"
        # Compressed extracts go through zcat, plain ones through cat.
        case "$i" in
                *.gz) reader=zcat ;;
                *)    reader=cat ;;
        esac
        # Adding 0 makes awk print 0 rather than an empty string when an
        # element type never occurs in the file.
        "$reader" "$i" | awk '/^<node/ {nodes++;}
                         /^<segment/ {segs++;}
                         /^<way/ {ways++;}
                         END {print "nodes   : ", nodes + 0;
                              print "segments: ", segs + 0;
                              print "ways    : ", ways + 0;}' \
        > "$stats" || break
done

#!/usr/bin/perl
# make stat files (named without the .gz suffix, e.g. foo.osm.stats, which is what this script reads):
# find counties/ -name '*.osm.gz' | while read i; do [ -e  "${i%.gz}.stats" ] && continue; echo $i;  zcat "$i" | awk '/^<node/ {nodes++;} /^<segment/ {segs++;} /^<way/ {ways++;} END {print "nodes   : ", nodes; print "segments: ", segs; print "ways    : ", ways;}' > "${i%.gz}.stats"; done
use daveperl;

# Directory that holds one sub-directory per state.
my $state_dir = 'counties';

# Open the page (everything is emitted inside <PRE>), followed by the
# output of the date command.
print("<HTML><PRE>\n", `date`);

# Names of the state directories, as returned by ls_nodot().
my @states = ls_nodot($state_dir);

# Parse a "key<separator>value" file into a hash reference.
#
# Arguments:
#   $file      - path of the file to read
#   $separator - optional pattern handed to split(); defaults to ':'
#
# All whitespace is stripped from each line before it is split, so a line
# such as "nodes   :  12" is stored as nodes => 12.  Lines that are empty
# after stripping are ignored.  If $file does not exist a notice is printed
# and an empty hash reference is returned without attempting to read it.
#
# Returns a hash reference (never undef).
sub read_into_hash
{
        my ($file, $separator) = @_;
        if (!defined($separator) || !length($separator)) {
                $separator = ':';
        }
        my $hash = {};
        if (! -e $file) {
                print "$file does not exist\n";
                return $hash;
        }
        foreach my $line (cat_file_into_array($file)) {
                next if (!defined($line));
                $line =~ s/\s//g;
                # A blank line would otherwise create a bogus '' key.
                next if (!length($line));
                # A limit of 2 keeps any separator inside the value intact.
                my ($var, $val) = split($separator, $line, 2);
                $hash->{$var} = $val;
        }
        return $hash;
}

# Walk every state directory, fold the per-county stat files into per-state
# and country-wide totals, rewrite each state's .stats file, then print the
# country summary and a completion estimate.

# Country-wide totals, keyed like the stat files ("nodes", "segments",
# "ways", "counties") plus a "<key>-completed" entry for finished counties.
my %country;
foreach my $state (@states) {
        my $state_stat_file = "$state_dir/$state/.stats";
        my $state_stat_hash = read_into_hash($state_stat_file);

        # A county may be present as both foo.osm and foo.osm.gz; strip the
        # compression suffix and keep each county only once.
        my %seen;
        my @counties;
        foreach my $entry (grep(/\.(osm|gz)$/, ls("$state_dir/$state"))) {
                (my $county_name = $entry) =~ s/\.gz$//;
                push(@counties, $county_name) if (!$seen{$county_name}++);
        }
        #printf "nr counties in dir: %d\n", scalar @counties;
        #printf "nr counties in stat file: %d\n", $state_stat_hash->{counties};

        # Shortcut currently disabled (the "next" is commented out): every
        # state is recomputed even when its recorded county count matches.
        if (defined($state_stat_hash->{counties})
            && (scalar @counties) == $state_stat_hash->{counties}) {
                #next;
        }

        # Start the state totals from scratch; they are rebuilt below.
        $state_stat_hash = {};
        foreach my $county (@counties) {
                my $county_file = "$state_dir/$state/$county";
                my $county_stat_hash = read_into_hash("$county_file.stats");
                $county =~ s/\.osm$//;
                # keys() in boolean context is the number of keys; wrapping it
                # in length() would always be true and hide an empty read.
                if (!$county_stat_hash || !keys(%$county_stat_hash)) {
                        die "bad read of county stat file: $county_file.stats";
                }
                my $is_completed = 0;
                if (-f "$county_file.completed") {
                        printf "%30s complete\n",  "$county, $state";
                        $is_completed = 1;
                        $country{"counties-completed"}++;
                } elsif (-f "$county_file.uuid") {
                        # The log is split on carriage returns and only the
                        # last piece is shown.
                        my $log = cat_file("$county_file.log");
                        my @parts = split(/\r/, defined($log) ? $log : '');
                        my $last = @parts ? $parts[-1] : '';
                        # Names are passed as arguments so a '%' in them
                        # cannot be taken as a format directive.
                        printf "%s, %s being processed: %s\n", $county, $state, $last;
                }
                foreach my $key (keys %$county_stat_hash) {
                        # Add each county exactly once; adding it twice doubles
                        # every total relative to the "-completed" figures.
                        $state_stat_hash->{$key} += $county_stat_hash->{$key};
                        if ($is_completed) {
                                $state_stat_hash->{"$key-completed"} += $county_stat_hash->{$key};
                        }
                }
        }

        # Rewrite the state's stat file and roll it up into the country totals.
        unlink $state_stat_file;
        foreach my $key (keys %$state_stat_hash) {
                my $val = $state_stat_hash->{$key};
                $country{$key} += $val;
                file_append_line($state_stat_file, "$key:$val");
        }
        file_append_line($state_stat_file, "counties:".scalar(@counties));
        $country{counties} += scalar(@counties);
}

printf "Entire Country:\n";
printf("%20s %10s/%10s\n", "", "complete", "total");
foreach my $key (sort keys %country) {
        next if ($key =~ /-completed$/);
        my $val = $country{$key} || 0;
        my $complete = $country{"$key-completed"} || 0;
        # Guard against a zero total so the percentage cannot divide by zero.
        my $percent = $val ? (100.0 * $complete / $val) : 0;
        printf("%20s:%10s/%10s (%5.2f%%)\n", $key, $complete, $val, $percent);
}

# Completion estimate.  $start_time is a hard-coded epoch timestamp minus
# five days; the upload rate is measured from that moment until now.
my $start_time = 1187801839-86400*5;
my $now = time();
my $elapsed = $now - $start_time;
my $total_objects = 0;
my $total_completed_objects = 0;
for my $type (qw(nodes segments ways)) {
        $total_objects += $country{$type} || 0;
        $total_completed_objects += $country{"$type-completed"} || 0;
}
my $objects_per_sec = ($elapsed > 0) ? $total_completed_objects / $elapsed : 0;
if ($objects_per_sec > 0) {
        # Time left = objects still to upload divided by the observed rate.
        my $required_seconds = ($total_objects - $total_completed_objects) / $objects_per_sec;
        my $required_days = $required_seconds / 86400;
        my $required_years = $required_days / 365.25;
        printf("at this rate of upload, the entire US will be done in: %d seconds or %d days or %f years\n",
                $required_seconds, $required_days, $required_years);
        printf("on %s\n", scalar localtime($required_seconds+$now));
} else {
        # No completed objects (or no elapsed time) means no measurable rate.
        print "nothing has been uploaded yet, so no completion estimate is available\n";
}
print "</PRE></HTML>\n";

-- Dave