[OSM-dev] TIGER import.rb

Dave Hansen dave at sr71.net
Tue Jun 19 02:58:57 BST 2007


On Mon, 2007-06-18 at 17:57 -0700, Brandon Martin-Anderson wrote:
> >From before: The endpoints of TIGER features have a globally unique
> ID called  "zero cell id" or sometimes "tzid". that's what you'll find
> tagged to the endpoints of some ways.

Do you think we should fill this in on all intermediate points in a way?
It might make it easier to track things down in the future if people are
editing and split/join ways in the process.

> Regarding weird errors. First thing I'd do is find some way to view
> the same TIGER file in some other viewer, just to make sure it's not a
> glitch in the TIGER data. In the past I've run into problems like this
> and spent hours hunting it down, just to find the error was just in
> crumby data. If that's not the case, I'll go in sleeves rolled up to
> see about the bug. 

I guess it's possible that these are not using the exact same data set,
but here are two views of the same area with none of the strange
segments.

http://sigma.openplans.org/?lat=38.91436&lon=-84.92017&zoom=14&layers=BT
http://tiger.census.gov/cgi-bin/mapsurfer?act=out&outfact=2&map.x=211&map.y=180&lat=38.9147263&lon=-84.9243622&wid=0.022&ht=0.009&iht=359&iwd=422&&&&tlevel=-&tvar=-&tmeth=i&mlat=&mlon=&msym=bigdot&mlabel=&murl=&conf=mapnew.con

Do you know of any standalone viewers for the TIGER data? 

> I haven't had a moment to sit down with your code, so I'm noticing a
> few things for the first time. You implemented grouping of ways by
> street name, which is good for map rendering. There are some cases
> where I'd like to see a non-grouped osm.xml, like if I'm curious about
> the TLID of a buggy feature. Perhaps we could implement a command-line
> switch to turn grouping behavior on and off.

Done.  You can add --no-coalesce to the command line now. 

Here's another diff against the code that you sent before.  I've made
some speed improvements from before (function calls in ruby are SLOW!).
I've also added some improved progress meters which really help you
figure out why your system is chugging along with some of the larger
data sets.

> Your example TIGER file is _beautiful_ compared to the one I slapped
> into the OSM map a week ago. 

Thanks.  Nice rendering, plus ease in editing is why I started hacking
on it to begin with.  I'm glad you like how it looks.

-- Dave

diff -ruN orig/dave_model.rb dave-model/dave_model.rb
--- orig/dave_model.rb	1969-12-31 16:00:00.000000000 -0800
+++ dave-model/dave_model.rb	2007-06-18 18:48:25.000000000 -0700
@@ -0,0 +1,215 @@
+module OSMModel
+require 'fileutils'
+require 'find'
+require 'net/http'
+require 'timeout'
+require 'uri'
+
+def debug(string)
+	#$stderr.puts string
+end
+
+def hash_to_tags(h)
+	s = ''
+	if ! h.empty?
+		s += "\n"
+	end
+	h.each_pair do |key, value|
+		if (value.nil? || (value.to_s.length == 0))
+			#$stderr.print(sprintf("hash_to_tags() bad value[%s]: '%s'\n", key, value.to_s))
+			next
+		end
+		value.gsub(/\'/, "\\'")
+		value.gsub(/\&/, "amp")
+		s += "\t<tag k=\"#{key}\" v=\"#{value}\"/>\n"
+	end
+	s
+end
+module_function :hash_to_tags
+$c1 = 0
+$c2 = 0
+class Point
+	attr_accessor :osmid
+        def initialize point, tags={} #expect long, lat point
+	      @osmid = 0
+	      @visible="true"
+	      @lat = point[1]
+	      @lon = point[0]
+	      @tags = tags
+	end
+	def distance(lat, lon)
+		latdiff = (lat - @lat).abs
+		londiff = (lon - @lon).abs
+		Math.sqrt(latdiff*latdiff + londiff*londiff)
+	end
+	def to_xml
+		return "" if @printed
+		@printed = true
+		return "<node id=\'#{@osmid}\' visible=\'true\' lat=\'#{@lat}\' lon=\'#{@lon}\'>" +
+			OSMModel.hash_to_tags(@tags) + "</node>\n"
+	end
+	def id
+		@osmid
+	end
+	def to_s
+		"lat: #{@lat} lon: #{@lon} id: #{@id}"
+	end
+end
+ 
+# Create a new object
+# g = Greeter.new("world")
+#  
+#  # Output "Hello World!"
+#  g.salute
+
+class Segment
+	attr :to
+	attr :from
+	attr :tags
+	attr_accessor :osmid
+	def initialize(from, to, tags = {})
+		@from = from
+		@to = to
+		@tags = tags
+		@printed = false
+		#debug("created segment: " + self.to_s)
+  	end
+	def comes_before(seg)
+		if self.to == seg.from
+			return true
+		end
+		return false
+	end
+	def shares_origin(seg)
+		return self.from == seg.from
+	end
+	def shares_terminus(seg)
+		return self.to == seg.to
+	end
+	def reverse
+		tmp = @from
+		@from = @to
+		@to = tmp
+	end
+	def to_xml
+		return "" if @printed
+		@printed = true
+		return from.to_xml() +
+		       to.to_xml() + 
+		"<segment from=\'#{from.id}\' to=\'#{to.id}\' id=\'#{self.id}\' visible=\'true\'>" +
+		OSMModel.hash_to_tags(@tags) +
+		"</segment>\n"	
+	end
+	def id
+		@osmid
+	end
+	def to_s
+		"segment:\{id:#{@osmid} from:#{@from.to_s} to:#{@to.to_s}\}"
+	end
+end
+ 
+def create_segment(from_point, to_point, tags = {})
+	return Segment.new(from_point, to_point, tags)
+end
+
+
+class Way
+	attr :segs
+	attr :tags
+	attr_accessor :osmid
+	def initialize(segs, tags = {})
+		@segs = []
+		segs.each do |seg|
+			self.add_segment(seg)
+		end
+		@tags = tags
+  	end
+	def reverse
+		@segs.each do |seg|
+			seg.reverse
+		end
+		@segs.reverse!
+	end
+	def name
+		return @tags["name"]
+	end
+	def __add_segment(newseg)
+		if @segs.empty?
+			#debug("added " + newseg.to_s  + " to empty way #{@osmid}")
+			@segs.push(newseg)
+			return true
+		end
+		if newseg.comes_before(@segs.first)
+			#debug("added "+newseg.to_s+" to head of way #{@osmid} before " + segs.first.to_s)
+			@segs.unshift(newseg)
+			return true
+		end
+		if @segs.last.comes_before(newseg)
+			#debug("added "+newseg.to_s+" to tail of way #{@osmid} after " + segs.last.to_s)
+			@segs.push(newseg)
+			return true
+		end
+		return false
+	end
+	def add_segment(newseg)
+		if __add_segment(newseg)
+			return true
+		end
+		# try the segment in the opposite direction
+		newseg = Segment.new(newseg.to, newseg.from, newseg.tags)
+		return __add_segment(newseg)
+	end
+	def combine_with(way)
+		# check tag compatibility here
+		if ((way.segs.first.to == segs.first.to) ||
+		    (way.segs.last.to == segs.last.to))
+			way.reverse
+		end
+			
+		combined = false
+		if segs.last.to == way.segs.first.from
+			@segs.concat(way.segs)
+			combined = true
+		elsif way.segs.last.to == @segs.first.from
+			@segs = way.segs | @segs
+			combined = true
+		end
+		# if the passed in way is empty, we've succeeded
+		return combined
+	end
+	def to_xml
+		@segs.map { |seg| seg.to_xml }.join +
+		"<way id=\'#{@osmid}' visible=\'true\'>" +
+		@segs.map { |seg| "\n\t<seg id=\'#{seg.id}\'/>" }.join +
+		OSMModel.hash_to_tags(@tags) +
+		"</way>\n"
+	end
+	def name
+		return @tags["name"]
+	end
+	def id
+		@osmid
+	end
+	def verify
+		return
+		last_seg = nil
+		@segs.each do |seg|
+			if !last_seg.nil?
+				if last_seg.to != seg.from
+					$stderr.puts "invalid way: " + self.to_s
+					exit(1)
+				end
+			end
+			last_seg = seg
+		end
+	end
+	def to_s
+		ret = "way:\{id:#{@osmid} segments: ["
+		@segs.each do |seg|
+			ret += "\n   " + seg.to_s + ","
+		end
+		ret += "]\}"
+	end
+end
+
+end
diff -ruN orig/foo.rb dave-model/foo.rb
--- orig/foo.rb	1969-12-31 16:00:00.000000000 -0800
+++ dave-model/foo.rb	2007-06-18 18:48:25.000000000 -0700
@@ -0,0 +1,11 @@
+require 'to_osm.rb'
+
+GC.disable
+dataset = TigerLine::Dataset.new ARGV[0]
+dataset.read
+
+out = dataset.to_osm_xml
+
+fp = File.new dataset.filename_base+".osm.xml", "w"
+fp.write( out )
+fp.close
diff -ruN orig/tiger.rb dave-model/tiger.rb
--- orig/tiger.rb	2007-06-16 07:29:23.000000000 -0700
+++ dave-model/tiger.rb	2007-06-18 18:48:25.000000000 -0700
@@ -118,19 +118,20 @@
  
       fp_size = File.size filename
       fp = File.new filename, "r"
-      i=0
+      lastper = 0
       fp.each_line do |line|
-        if i%5000 == 0 then print sprintf("%.1f", (Float(fp.pos)/fp_size)*100 ) + "%\n" end
-
+	per = fp.pos*100/fp_size
+	if per >= lastper+4
+		print sprintf("\rparsed %d/%d kbytes (%d%%)", fp.pos/1024, fp_size/1024, per);
+	end
         record = record_class.new( line )
         if key_field then
           @records[ record[ key_field ] ] = record
         else
           @records << record
         end
-
-        i += 1
       end
+      print "...done\n"
       fp.close
 
       @records
@@ -206,6 +207,10 @@
     attr_reader :filename_base, :features
 
     def initialize directory
+      if FileTest.directory?(directory) == false
+	$stderr.puts directory + " is not a directory"
+	exit 1;
+      end
       @features = nil
       @filename_base = Dir["#{directory}/*.RT1"].first
       if @filename_base then
@@ -261,14 +266,20 @@
         rt1_records[ record[:tlid] ].rti_record = record
       end
 
-      print "Parsing TIGER records into features...\n"
       i=0
       n=rt1_records.size
+      printf "Parsing %d TIGER records into features...\n", n
+      lastper = 0;
       rt1_records.each do |key, record|
-        i += 1; if i%5000 == 0 then print sprintf("%.1f", (Float(i)/n)*100 ) + "%\n" end
-        
+        i += 1
+	per = 100*i/n
+	if per != lastper
+		print sprintf("\rturned %d/%d records into features (%d%%)", i, n, per)
+        end
+	lastper = per
         @features[ key ] = LineFeature.new( record )
       end
+      print "...done\n"
 
       true
     end
diff -ruN orig/tiger_to_osm.rb dave-model/tiger_to_osm.rb
--- orig/tiger_to_osm.rb	2007-06-16 07:29:23.000000000 -0700
+++ dave-model/tiger_to_osm.rb	2007-06-18 18:48:25.000000000 -0700
@@ -1,9 +1,23 @@
 require 'to_osm.rb'
+require 'getoptlong'
 
-dataset = TigerLine::Dataset.new ARGV[0]
+opts = GetoptLong.new(
+     [ '--no-coalesce', '-n', GetoptLong::NO_ARGUMENT ]
+    )
+
+coalesce = true
+options = Hash.new
+opts.each do |opt, arg|
+      case opt
+        when '--no-coalesce'
+          coalesce = false
+      end
+end
+dir = ARGV[0]
+dataset = TigerLine::Dataset.new(dir)
 dataset.read
 
-out = dataset.to_osm_xml
+out = dataset.to_osm_xml(coalesce)
 
 fp = File.new dataset.filename_base+".osm.xml", "w"
 fp.write( out )
diff -ruN orig/to_osm.rb dave-model/to_osm.rb
--- orig/to_osm.rb	2007-06-16 07:29:23.000000000 -0700
+++ dave-model/to_osm.rb	2007-06-18 18:48:25.000000000 -0700
@@ -3,97 +3,176 @@
 require 'tiger'
 require 'attr_map'
 require 'cgi' #to escape srings
-require 'osm_model'
+require 'dave_model'
 
 module TigerLine
 
-  class Dataset
+class Dataset
+    def new_point(lat, lon, tags = {})
+    	point = nil
+    	key = lat+lon
+    	loop = 0;
+    	while $points.has_key?(key)
+    		point = $points[key]
+    		return point if point.distance(lat,lon) < 0.0000001
+    		key += lat+lon
+    		loop+=1
+    		$stderr.puts "loop nr: #{loop}" if loop > 10
+    	end
+    	point = Point.new(lat, lon, tags)
+    	while $points.has_key?(key)
+    		key += lat+lon
+    	end
+    	$points[key] = point
+    	point
+    end
+    
+    def create_way(coalesce, segs, tags = {})
+	if @ways.nil?
+		@ways = Hash.new
+	end
+    	name = tags["name"]
+ 	if @ways[name].nil?
+    		#debug("creating array for \"" + name + "\"")
+    		@ways[name] = []
+    	end
+    	new_way = OSMModel::Way.new(segs, tags)
+    	merged = false
+  	@ways[name].each do |way|
+		if ! coalesce
+			break
+		end
+    		if ! way.combine_with(new_way)
+			next
+		end
+    		if merged == false 
+    			# we actually merged the truly
+    			# new way into an existing one
+    			merged = true
+    		else
+    			# the new way connected two older ways,
+    			# we need to delete the way that 
+    			# was consolidated
+    			@ways[name].delete(new_way)
+    		end
+    		new_way = way
+    	end
+    
+    	# we did not consolidate this way into another, make sure
+    	# to get it into the hash as a new entry
+    	if merged == false
+    		@ways[name].push(new_way)
+    	end
+    end
+    
 
-    def to_osm
+    def to_osm(coalesce)
       ret = []
       tzid_to_osm = {}
 
+      printf "converting %d TIGER features into OSM objects...\n", @features.size
+      nr = 0
       @features.each_pair do |tlid, feature|
+	nr += 1
+	if (nr%100 == 0)
+      	  $stderr.write(sprintf("\rconverted %d/%d TIGER features to OSM (%5.2f%%)", nr, @features.size, (nr.to_f*100/@features.size)))
+	end
         unless tags = TIGER_TO_OSM[feature.cfcc.downcase.intern] then
           next
         end
 
         osmpoints = []
-        osmpoints << (tzid_to_osm[feature.tzids] ||= OSMModel::Point.new( feature.points.first, {"tzid" => feature.tzids} ))
+
+	node_tags = Hash.new
+        node_tags["source"] = "tiger_import_#{Time.now.strftime("%Y%m%d")}"
+	node_tags["tiger:tzid"] = feature.tzids
+        osmpoints << (tzid_to_osm[feature.tzids] ||= OSMModel::Point.new( feature.points.first, node_tags))
         feature.points[1..-2].each do |point|
-          osmpoints << OSMModel::Point.new( point )
+          osmpoints << OSMModel::Point.new( point, node_tags )
         end
-        osmpoints << (tzid_to_osm[feature.tzide] ||= OSMModel::Point.new( feature.points.last, {"tzid" => feature.tzide} ))
-
-        way = OSMModel::Way.new
+        osmpoints << (tzid_to_osm[feature.tzide] ||= OSMModel::Point.new( feature.points.last, node_tags))
 
+	way_segs = []
         osmpoints[0..-2].each_with_index do |point, i|
-          way.segs << OSMModel::Segment.new( point, osmpoints[i+1] )
+		segment = OSMModel::Segment.new( point, osmpoints[i+1] )
+        	if feature.tzids.to_i == 10602252
+			$stderr.print(sprintf("segment created with tzid: (%s)\n", segment.to_s))
+		end
+		way_segs << segment
         end
 
+	way_tags = Hash.new
         #name tags
         feature.names.each_with_index do |name, i|
           strname = "#{name[:fedirp]} #{name[:fename]} #{name[:fetype]} #{name[:fedirs]}".strip
           ord_suffix = ("_#{i}" if i>0) or ""
-          way.tags["name#{ord_suffix}"] = "#{CGI.escapeHTML(strname)}"
-          way.tags["name_direction_prefix#{ord_suffix}"] = "#{CGI.escapeHTML(name[:fedirp])}" if not name[:fedirp].empty?
-          way.tags["name_base#{ord_suffix}"] = "#{CGI.escapeHTML(name[:fename])}" if not name[:fename].empty?
-          way.tags["name_type#{ord_suffix}"] = "#{CGI.escapeHTML(name[:fetype])}" if not name[:fetype].empty?
-          way.tags["name_direction_suffix#{ord_suffix}"] = "#{CGI.escapeHTML(name[:fedirs])}" if not name[:fedirs].empty?
+          way_tags["name#{ord_suffix}"] = "#{CGI.escapeHTML(strname)}"
+          way_tags["name_direction_prefix#{ord_suffix}"] = "#{CGI.escapeHTML(name[:fedirp])}" if not name[:fedirp].empty?
+          way_tags["name_base#{ord_suffix}"] = "#{CGI.escapeHTML(name[:fename])}" if not name[:fename].empty?
+          way_tags["name_type#{ord_suffix}"] = "#{CGI.escapeHTML(name[:fetype])}" if not name[:fetype].empty?
+          way_tags["name_direction_suffix#{ord_suffix}"] = "#{CGI.escapeHTML(name[:fedirs])}" if not name[:fedirs].empty?
         end
         #address range tags
         feature.address_ranges.each_with_index do |range, i|
           ord_suffix = ("_#{i}" if i>0) or ""
-          way.tags["from_address_right#{ord_suffix}"] = "#{range[:fraddr]}" if not range[:fraddr].empty?
-          way.tags["to_address_right#{ord_suffix}"] = "#{range[:toaddr]}" if not range[:toaddr].empty?
-          way.tags["from_address_left#{ord_suffix}"] ="#{range[:fraddl]}" if not range[:fraddl].empty?
-          way.tags["to_address_left#{ord_suffix}"] = "#{range[:toaddl]}" if not range[:toaddl].empty?
-          way.tags["zip_left#{ord_suffix}"] = "#{range[:zipl]}" if not range[:zipl].empty?
-          way.tags["zip_right#{ord_suffix}"] = "#{range[:zipr]}" if not range[:zipr].empty?
+          way_tags["from_address_right#{ord_suffix}"] = "#{range[:fraddr]}" if not range[:fraddr].empty?
+          way_tags["to_address_right#{ord_suffix}"] = "#{range[:toaddr]}" if not range[:toaddr].empty?
+          way_tags["from_address_left#{ord_suffix}"] ="#{range[:fraddl]}" if not range[:fraddl].empty?
+          way_tags["to_address_left#{ord_suffix}"] = "#{range[:toaddl]}" if not range[:toaddl].empty?
+          way_tags["zip_left#{ord_suffix}"] = "#{range[:zipl]}" if not range[:zipl].empty?
+          way_tags["zip_right#{ord_suffix}"] = "#{range[:zipr]}" if not range[:zipr].empty?
         end
         #preserved tiger fields
-        way.tags["tiger:tlid"] = "#{@tlid}"
-        way.tags["tiger:cfcc"] = "#{@cfcc}"
+        way_tags["tiger:tlid"] = "#{@tlid}"
+        way_tags["tiger:cfcc"] = "#{@cfcc}"
         #misc tags
-        way.tags["source"] = "tiger_import_#{Time.now.strftime("%Y%m%d")}"
-        way.tags["reviewed"] = "no"
+        way_tags["source"] = "tiger_import_#{Time.now.strftime("%Y%m%d")}"
+        way_tags["reviewed"] = "no"
         #cfcc tags
         tags.each_pair do |key, value|
-          way.tags["#{CGI.escapeHTML(key.to_s)}"] = "#{CGI.escapeHTML(value.to_s)}"
+          way_tags["#{CGI.escapeHTML(key.to_s)}"] = "#{CGI.escapeHTML(value.to_s)}"
         end
-
-        ret << way
+	way = create_way(coalesce, way_segs, way_tags)
       end
-
+      printf "...done\n"
       return ret
     end
 
-    def to_osm_xml
-      ways = self.to_osm
+    def to_osm_xml coalesce
       ret = []
-      
-      osmid = 0
-
-      ways.each do |way|
-        way.segs.each do |seg|
-          if seg.from.osmid==0 then 
-            seg.from.osmid = (osmid+=1)
-            ret << seg.from.to_xml
-          end
-          if seg.to.osmid==0 then 
-            seg.to.osmid = (osmid+=1)
-            ret << seg.to.to_xml
-          end
-          seg.osmid = (osmid+=1)
-          ret << seg.to_xml
-        end
-        way.osmid = (osmid+=1)
-        ret << way.to_xml
+      self.to_osm(coalesce)
+      osmid = -1
+      ret << "<?xml version='1.0' encoding='UTF-8'?>\n"
+      ret << "<osm version='0.4' generator='JOSM'>\n"
+
+      printf "writing %d ways into xml\n", @ways.size
+      nr = 0
+      @ways.each_key do |name|
+	if nr%(@ways.size/100) == 0
+      	  $stderr.write(sprintf("done with %d ways (%d%%)\r", nr, (nr*100/@ways.size)))
+	end
+	nr+=1
+      	@ways[name].each do |way|
+        	way.segs.each do |seg|
+			if seg.from.osmid==0 then 
+				seg.from.osmid = (osmid-=1)
+				ret << seg.from.to_xml
+			end
+			if seg.to.osmid==0 then 
+				seg.to.osmid = (osmid-=1)
+				ret << seg.to.to_xml
+			end
+			seg.osmid = (osmid-=1)
+			ret << seg.to_xml
+		end
+		way.osmid = (osmid-=1)
+		ret << way.to_xml
+	end
       end
-
-      return ret.join("\n")
+      printf "\ndone\n";
+      ret << "</osm>"
+      return ret.join("")
     end
-
-  end
+end
 
 end






More information about the dev mailing list