# TODO: Add comment # # Author: Gene ############################################################################### library(XML) ## You will have to set up your own directory ## Make sure that "Functions" is a sub directory with the dependent functions setwd('~/Dropbox/R/XLConnect') rm(list=ls()) ##----------------------------------------------------------------------------- ## Define initial URL ##----------------------------------------------------------------------------- sunriseURL = paste( 'http://www.timeanddate.com/worldclock/astronomy.html', '?n=64&month=1&year=2012&obj=sun&afl=-1&day=1', sep='') sunriseURL ##----------------------------------------------------------------------------- ## Use XML library to get first HTML table as an example ##----------------------------------------------------------------------------- ExampleData = readHTMLTable(sunriseURL, header=T, which=1, stringsAsFactors=F) ##----------------------------------------------------------------------------- ## Let's see what we've got ##----------------------------------------------------------------------------- ExampleData str(ExampleData) colnames(ExampleData) ExampleData[,1] ExampleData[1,] ##----------------------------------------------------------------------------- ## DATA CLEANSING DEVELOPMENT ## Develop some code to clean up messy table ##----------------------------------------------------------------------------- ## Make a copy of the original download to make it easier to go back examp = ExampleData location = colnames(examp)[1] ## Extract location examp = examp[5:nrow(examp),] ## Delete first 4 rows examp = examp[1:(nrow(examp)-1), ] ## Delete last row rownames(examp) = NULL ## Restart row numbers ## Manually define column names colnames(examp) = c( 'Date', 'Astronomical_Twilight_Starts', 'Astronomical_Twilight_Ends', 'Nautical_Twilight_Starts', 'Nautical_Twilight_Ends', 'Civil_Twilight_Starts', 'Civil_Twilight_Ends', 'Sunrise', 'Sunset', 'Azimuth_Sunrise', 'Azimuth_Sunset', 'Day_Length', 'Day_Length_Difference', 'Solar_Noon_Time', 'Solar_Noon_Altitude', 'Distance_10e6_KM') examp ## Looking a lot better already ## Convert the Date examp$Date examp$Date = as.Date(examp$Date, "%b %d, %Y") examp$Date str(examp) ##----------------------------------------------------------------------------- ## DATA CLEANSING DEVELOPMENT (CONTINUTED) ## Work on the columns containing time values ##----------------------------------------------------------------------------- ## Define the columns with time values manually jj = c(2:9, 14) ## Remove all the 12:00 Noon references and replace with 12:00 PM examp[,jj] = sapply(examp[ , jj], function(x) gsub('12:00 Noon', '12:00 PM',x)) ## Join date column with time columns (cols 2-9, and col 14) examp[,jj] = sapply(jj, function(x) paste(examp[,1], examp[,x])) ## Convert times columns to a date/time format strptime(examp[,2], "%Y-%m-%d %I:%M %p") examp[,jj] = sapply(jj, function(x) strptime(examp[,x], "%Y-%m-%d %I:%M %p")) str(examp) examp[,jj] ##----------------------------------------------------------------------------- ## DATA CLEANSING DEVELOPMENT (CONTINUTED) ## Convert numeric data to numeric ##----------------------------------------------------------------------------- examp$Azimuth_Sunrise = as.numeric( substr(examp$Azimuth_Sunrise, 1, nchar(examp$Azimuth_Sunrise)-1)) examp$Azimuth_Sunset = as.numeric( substr(examp$Azimuth_Sunset, 1, nchar(examp$Azimuth_Sunset)-1)) examp$Solar_Noon_Altitude = as.numeric( substr(examp$Solar_Noon_Altitude, 1, nchar(examp$Solar_Noon_Altitude)-1)) examp$Distance_10e6_KM = as.numeric( substr(examp$Distance_10e6_KM, 1, nchar(examp$Distance_10e6_KM)-1)) ##----------------------------------------------------------------------------- ## DATA CLEANSING DEVELOPMENT (CONTINUTED) ## Convert day length to decimal ##----------------------------------------------------------------------------- str(examp) ## "Day_Length" examp$Day_Length ## Make a copy of Day Length for ease of typing tmp = examp$Day_Length dayhours = as.numeric(gsub('h.+','', tmp)) dayminutes = as.numeric(gsub('.+h |m .+','', tmp)) dayseconds = as.numeric(gsub('.+m |s','', tmp)) ## Replace "Day Length" examp$Day_Length = dayhours/24 + dayminutes / 24 / 60 + dayseconds / 24 / 60 / 60 ## New "Day Length" examp$Day_Length ## Clean up rm(tmp, dayhours, dayminutes, dayseconds) ## "Day_Length_Difference" examp$Day_Length_Difference tmp = examp$Day_Length_Difference ## Standardize the vector to include minutes tmp = gsub('\\+ ', '\\+ 0m ', tmp) ## Extract time components daysign = substr(tmp, 1, 1) dayminutes = as.numeric(gsub('^. |m .+','', tmp)) dayseconds = as.numeric(gsub('.+m |s','', tmp)) daysignvector = rep(1, length(daysign)) daysignvector[daysign=='-'] = -1 ## New "Day Length Difference" examp$Day_Length_Difference = (dayminutes / 24 / 60 + dayseconds / 24 / 60 / 60) * daysignvector ## Clean up rm(tmp, daysign, daysignvector, dayminutes, dayseconds) ##----------------------------------------------------------------------------- ## Converted! ##----------------------------------------------------------------------------- examp str(examp)