Difference between revisions of "Opasnet Base Connection for R"

From Testiwiki
Jump to: navigation, search
m (Downloading data: utf-8 encoding parameter added to GetLocs)
(Functions: fixed to trim leading and trailing whitespaces using regular expressions)
Line 195: Line 195:
 
for (i in ColNames) {
 
for (i in ColNames) {
 
dataframe[,i] <- factor(dataframe[,i])
 
dataframe[,i] <- factor(dataframe[,i])
 +
levels(dataframe[,i]) <- gsub(" *$", "",gsub("^ *", "", levels(dataframe[,i])))
 
if(sum(Encoding(levels(dataframe[,i]))=="latin1")!=0) levels(dataframe[,i]) <- iconv(levels(dataframe[,i]), "latin1", "UTF-8")
 
if(sum(Encoding(levels(dataframe[,i]))=="latin1")!=0) levels(dataframe[,i]) <- iconv(levels(dataframe[,i]), "latin1", "UTF-8")
 
}
 
}
Line 276: Line 277:
 
for (i in ColIds) {
 
for (i in ColIds) {
 
LocIdMap <- LocIds[LocIds$obj_id_i == i, 1]
 
LocIdMap <- LocIds[LocIds$obj_id_i == i, 1]
names(LocIdMap) <- tolower(LocIds[LocIds$obj_id_i == i, 3])
+
names(LocIdMap) <- gsub(" *$", "",gsub("^ *", "", tolower(LocIds[LocIds$obj_id_i == i, 3])))
 
levels(dataframe[, i]) <- LocIdMap[tolower(levels(dataframe[, i]))]
 
levels(dataframe[, i]) <- LocIdMap[tolower(levels(dataframe[, i]))]
 
if (sum(is.na(levels(dataframe[, i]))) != 0) stop("Faulty location matching. Usually caused by special characters.")
 
if (sum(is.na(levels(dataframe[, i]))) != 0) stop("Faulty location matching. Usually caused by special characters.")

Revision as of 10:45, 23 May 2011


Code for R for the purpose of interacting with the Opasnet Base is collected on this page. To use it, copy paste the code you need to the R console; this defines the functions, after which they can be called for in that R session. Or alternatively install the OpasnetBaseUtils package.

Package dependencies

These packages are required for most of the code to work. To install: from the top bar menu Packages>Install. To load: copy-paste.

library(utils)
library(RODBC)

Downloading data

Functions

op_baseGetData <- function(dsn, ident, include = NULL, exclude = NULL, series_id = NULL, iterations = NULL, use.utf8 = TRUE) {
	if (use.utf8) db <- odbcConnect(dsn, DBMSencoding = "UTF-8") else db <- odbcConnect(dsn)
	obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep=''))[1,1]
	if (length(series_id) == 0) {series_id <- sqlQuery(db, paste('SELECT series_id FROM actobj WHERE obj_id = ', obj_id, 
		' ORDER BY series_id DESC LIMIT 1', sep = ''))[1,1]}
	sliced <- FALSE
	locations <- NULL
	x <- 1
	basequery <- paste('SELECT loccell.cell_id FROM actobj LEFT JOIN cell ON actobj.id = cell.actobj_id LEFT JOIN', 
			' loccell ON cell.id = loccell.cell_id WHERE actobj.obj_id = ', obj_id, ' AND actobj.series_id = ', 
			series_id, ' AND loccell.loc_id IN(', sep = '')
	if (length(include) != 0) {
		sliced <- TRUE
		locations[x] <- paste("IN(", basequery, paste(include, collapse = ","), ")", sep = "")
		x <- x + 1
	}
	if (length(exclude) != 0) {
		sliced <- TRUE
		locations[x] <- paste("NOT IN(", basequery, paste(exclude, collapse = ","), ")", sep = "")
	}
	if (sliced == FALSE) {
		Data <- sqlQuery(db, paste('SELECT cell.id, res.obs, obj.ident AS ind, loc.location AS loc, res.result', 
			' FROM actobj LEFT JOIN cell ON actobj.id = cell.actobj_id LEFT JOIN res ON cell.id =', 
			' res.cell_id LEFT JOIN loccell ON cell.id = loccell.cell_id LEFT JOIN loc ON loccell.loc_id',
			' = loc.id LEFT JOIN obj ON loc.obj_id_i = obj.id WHERE actobj.obj_id = ', obj_id, 
			' AND actobj.series_id = ', series_id, if(length(iterations)==1){paste(" AND obs <= ", iterations, 
			sep = "")}, sep = '')) } else {
		Data <- sqlQuery(db, paste('SELECT cell.id, res.obs, obj.ident AS ind, loc.location AS loc, res.result', 
			' FROM actobj LEFT JOIN cell ON actobj.id = cell.actobj_id LEFT JOIN res ON cell.id =', 
			' res.cell_id LEFT JOIN loccell ON cell.id = loccell.cell_id LEFT JOIN loc ON loccell.loc_id',
			' = loc.id LEFT JOIN obj ON loc.obj_id_i = obj.id WHERE actobj.obj_id = ', obj_id, 
			' AND actobj.series_id = ', series_id, if(length(iterations)==1){paste(" AND obs <= ", iterations, 
			sep = "")}, ' AND (cell.id ', paste(locations, collapse = ') AND cell.id '), 
			'))', sep = ''))
	}
	odbcClose(db)
	Data <- Data[order(Data[,1], Data[,2], Data[,3]),]
	nind <- length(levels(Data[,3]))
	nres <- nrow(Data)/nind
	dataframe <- Data[1:nres*nind, c(1,2)]
	for (i in nind:1) {
		dataframe[,2 + nind - i + 1] <- factor(Data[1:nres*nind - i + 1, 4])
		colnames(dataframe)[2 + nind - i + 1] <- as.character(Data[nind - i + 1, 3])
	}
	dataframe[,3 + nind] <- Data[1:nres*nind, 5]
	colnames(dataframe)[3 + nind] <- "Result"
	rownames(dataframe) <- 1:nres
	return(dataframe)
}

Usage

variable <- op_baseGetData("opasnet_base", "page identifier", include = vector_of_loc_ids, exclude = vector_of_loc_ids)
  • Assuming "opasnet_base" is a correctly defined DSN (Data Service Name; in Windows XP: Control Panel\Administrative tools\Data Sources (ODBC)).
  • Include and exclude are optional.
    • Include picks all cells in the locations given.
      • The clearest case is when all the included locations belong to the same index: Any cells in the non-included locations of the index will be left out.
      • In case given locations are in multiple indices: The effect produced will be the same as picking separately for each index and removing duplicates.
    • Exclude unpicks any cells which are indexed by the locations given. Slower than include.
    • They can be used in unison.
  • Result will be in a table format with columns: id, obs, ind1, ind2 ... indn, Result.
  • series_id is an optional parameter, if it is not given the most current upload of the data will be downloaded.

Finding index data

Function

op_baseGetLocs <- function(dsn, ident, series_id = NULL, use.utf8 = TRUE) {
	if (use.utf8) db <- odbcConnect(dsn, DBMSencoding = "UTF-8") else db <- odbcConnect(dsn)
	obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep=''))[1,1]
	if (length(series_id) == 0) {series_id <- sqlQuery(db, paste('SELECT series_id FROM actobj WHERE obj_id = ', obj_id, 
		' ORDER BY series_id DESC LIMIT 1', sep = ''))[1,1]}
	Locs <- sqlQuery(db, paste("SELECT DISTINCT obj.ident AS ind, loc.location AS loc, loc.id AS loc_id", 
		" FROM actobj LEFT JOIN actloc ON actobj.id = actloc.actobj_id LEFT JOIN loc ON actloc.loc_id = loc.id", 
		" LEFT JOIN obj ON loc.obj_id_i = obj.id WHERE actobj.obj_id = ", obj_id, " AND actobj.series_id = ", 
		series_id, sep = ""))
	odbcClose(db)
	Locs <- Locs[order(Locs[,1]),]
	rownames(Locs) <- 1:nrow(Locs)
	return(Locs)
}
  • Returns all indexes and locations and their ids in a table of format: ind, loc, loc_id.

Manipulating data

Functions

DataframeToArray <- function(dataframe, rescol = NULL) {
	ColNames <- colnames(dataframe[(colnames(dataframe) == "id") == FALSE])
	if (length(ColNames[(ColNames == "obs")])>0) {if(length(levels(factor(dataframe[,"obs"]))) == 1) {ColNames <- ColNames[
	(ColNames == "obs") == FALSE]} else {dataframe[,"obs"] <- factor(as.character(dataframe[,"obs"]))}}
	if (length(rescol)==0) {
	rescol <- colnames(dataframe) == "Freq"
	if (length(rescol[rescol==TRUE]) >= 1) rescol <- "Freq" else {
		rescol <- colnames(dataframe) == "Result"
		if (length(rescol[rescol==TRUE]) >= 1) rescol <- "Result" else {
			rescol <- colnames(dataframe) == "result"
			if (length(rescol[rescol==TRUE]) >= 1) rescol <- "result" else stop("No result column found")
		}
	}
	} else {ColNames <- ColNames[(ColNames == rescol) == FALSE]}
	ColNames <- ColNames[(ColNames == "Result") == FALSE]
	ColNames <- ColNames[(ColNames == "result") == FALSE]
	ColNames <- ColNames[(ColNames == "Freq") == FALSE]
	nind <- length(ColNames)
	DimNames <- rep(vector("list", 1), nind)
	names(DimNames) <- ColNames
	indlengths <- 0
	for (i in 1:nind) {
		DimNames[[i]] <- levels(factor(dataframe[,ColNames[i]]))
		indlengths[i] <- length(DimNames[[i]])
	}
	array <- rep(NA, prod(indlengths))
	dim(array) <- indlengths
	dimnames(array) <- DimNames
	array[as.matrix(dataframe[,ColNames])] <- dataframe[,rescol]
	return(array)
}

Usage

variable2 <- DataframeToArray(variable1, rescol = NULL)
  • variable1 must be in similar format as the result when downloading.
  • Columns named "id" and various versions of "Result" are ignored for dimension creation.
    • "obs" column will also be ignored if there's only one.
  • The column containing the values may be defined in the parameters, otherwise it is assumed to be either "Freq", "Result" or "result" in that order.

Other useful stuff

as.data.frame(as.table(array))
dataframe[is.na(dataframe[,ncol(dataframe)])==FALSE,]
  • First line returns an array from a data frame.
  • Second line returns rows without empty values.
dataframe[grep("location", dataframe$index, ignore.case = TRUE),]
  • Returns all rows of a data frame where column "index" value is "location".
dataframe[order(dataframe[,"col1"],dataframe[,"col2"], ... ,dataframe[,"coln"]),]
  • Returns dataframe ordered by col1, col2, ... , coln.
read.csv("table.csv", sep = ",")
  • Returns a data.frame from a .csv file, sep is the separator used in the file.

Uploading data

Functions

op_baseWrite <- function(dsn, input, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL, acttype = NULL, 
	rescol = NULL, n.obs.const = FALSE, maxrows = 50000, use.utf8 = TRUE) {
	
	# Coerce input into a data frame if it isn't one already; get rid of empty cells
	if (is.array(input)) dataframe <- as.data.frame(as.table(input)) else dataframe <- input
	if (is.null(rescol)) {
	rescol <- colnames(dataframe) == "Freq"
	if (sum(rescol) == 1) rescol <- "Freq" else {
		rescol <- colnames(dataframe) == "Result"
		if (sum(rescol) == 1) rescol <- "Result" else {
			rescol <- colnames(dataframe) == "result"
			if (sum(rescol) == 1) rescol <- "result"
		}
	}}
	dataframe <- dataframe[is.na(dataframe[,rescol]) == FALSE,]
	ColNames <- colnames(dataframe)[!(colnames(dataframe)%in%c(rescol, "id", "obs"))]
	for (i in ColNames) {
		dataframe[,i] <- factor(dataframe[,i])
		levels(dataframe[,i]) <- gsub(" *$", "",gsub("^ *", "", levels(dataframe[,i])))
		if(sum(Encoding(levels(dataframe[,i]))=="latin1")!=0) levels(dataframe[,i]) <- iconv(levels(dataframe[,i]), "latin1", "UTF-8")
	}
	
	#if(!is.numeric(dataframe[,rescol])) 
	
	# Open database connection
	if(use.utf8) db <- odbcConnect(dsn, DBMSencoding = "UTF-8") else db <- odbcConnect(dsn)
	
	# Add page to database (if it doesn't already exist)
	if (is.null(ident)) if (interactive()) ident <- readline(paste("What is the identifier of this object?", 
		"\n", sep = "")) else stop("indentifier of object no given")
	obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep = ''))[1,1]
	if (is.na(obj_id)) {
		
		# Wiki id
		if (substr(ident, 1,5)=="Op_en") {wiki_id <- 1; page <- substr(ident, 6, nchar(ident))} else {
		if (substr(ident, 1,5)=="Op_fi") {wiki_id <- 2; page <- substr(ident, 6, nchar(ident))} else {
		if (substr(ident, 1,6)=="Heande") {wiki_id <- 6; page <- substr(ident, 7, nchar(ident))} else {
		if (substr(ident, 1,4)=="Erac") {wiki_id <- 6; page <- substr(ident, 5, nchar(ident))} else {
		wiki_id <- 0; page <- 0; warning("No wiki id found in ident, writing zero.")}}}}
		page <- as.numeric(page)
		if (is.na(page)) stop("could not convert characters following the wiki ident into a page number")
		
		# Name etc.
		if (is.null(name)) if (interactive()) name <- readline(paste("What is the name of this object?", 
			"\n", sep = "")) else stop("object name not given")
		if (is.null(objtype_id)) if (interactive()) objtype_id <- readline(paste("What type of object is", 
			" this (id)?", paste(paste(sqlQuery(db, "SELECT id FROM objtype")[,1], sqlQuery(db, paste("SELECT objtype",
			" FROM objtype", sep = ""))[,1], sep = " - "), collapse = ", "), "\n", collapse = " ")) else {
			stop("object type not given")}
		sqlQuery(db, paste('INSERT INTO obj (ident, name, objtype_id, page, wiki_id) VALUES ("', paste(ident, 
			name, sep = '","'), '",', paste(objtype_id, page, wiki_id, sep = ','), ')', sep = ''))
		obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep = ''))[1,1]
	}
	
	# Write act and actobj
	if (is.null(who)==TRUE) if (interactive()) {who <- readline(paste("What is the name of the uploader?", "\n", sep = "")) 
		} else stop("uploader name not given")
	series_id <- sqlQuery(db, paste("SELECT series_id FROM actobj WHERE obj_id = ", obj_id, " ORDER BY series_id DESC LIMIT 1", 
		sep = ""))[1,1]
	if (is.na(series_id)==FALSE) {if (is.null(acttype)==TRUE) {if (interactive()) {acttype <- readline(paste("What type of upload", 
		" is this? 4 - new data to replace any existing, 5 - new data to be appended to existing data (must have the same", 
		" indices).", "\n", sep = "")) 
		} else acttype <- 4 
		}} else acttype <- 4
	if (!(acttype%in%c(4,5))) stop ("proper acttype not given")
	
	sqlQuery(db, paste('INSERT INTO act (acttype_id, who, comments) VALUES (', acttype, ',"', who, '","R upload")', sep = ''))
	act_id <- sqlQuery(db, paste('SELECT id FROM act WHERE who = "', who,'" AND comments = "R upload" ORDER BY id DESC LIMIT 1', 
		sep = ''))[1,1]
	if (acttype == 4) series_id <- act_id
	if (is.null(unit)) if (interactive()) unit <- readline(paste("What is the unit of this object?", 
		"\n", sep = "")) else stop("unit not given")
	sqlQuery(db, paste('INSERT INTO actobj (act_id, obj_id, series_id, unit) VALUES (', paste(act_id, obj_id, series_id, sep = ','), 
		',"', unit, '")', sep = ''))
	actobj_id <- sqlQuery(db, paste('SELECT id FROM actobj WHERE act_id = ', act_id, sep = ''))[1,1]
	
	#Write indexes
	for (i in ColNames) {
		sqlQuery(db, paste('INSERT IGNORE INTO obj (ident, name, objtype_id) VALUES ("', gsub(' ', '_', i), '","', 
			i, '", 6)', sep = ''))
	}
	IndIds <- sqlQuery(db, paste('SELECT id, ident FROM obj WHERE ident IN("', paste(gsub(" ", "_", ColNames), 
		collapse = '","'), '")', sep = ''))
	IndIdMap <- IndIds$id
	names(IndIdMap) <- tolower(IndIds$ident)
	ColIds <- as.character(IndIdMap[tolower(gsub(" ", "_", ColNames))])
	colnames(dataframe)[colnames(dataframe)%in%ColNames] <- ColIds
	
	#Write locations
	for (i in ColIds) {
		for (j in levels(dataframe[, i])) {
			sqlQuery(db, paste('INSERT IGNORE INTO loc (obj_id_i, location) VALUES (', i, ',"', j, '")', 
				sep = ''))
		}
	}
	LocIds <- sqlQuery(db, paste('SELECT id, obj_id_i, location FROM loc WHERE obj_id_i IN("', paste(ColIds, collapse = '","'), 
		'")', sep = ''))

	for (i in ColIds) {
		LocIdMap <- LocIds[LocIds$obj_id_i == i, 1]
		names(LocIdMap) <- gsub(" *$", "",gsub("^ *", "", tolower(LocIds[LocIds$obj_id_i == i, 3])))
		levels(dataframe[, i]) <- LocIdMap[tolower(levels(dataframe[, i]))]
		if (sum(is.na(levels(dataframe[, i]))) != 0) stop("Faulty location matching. Usually caused by special characters.")
		
		#Writing actloc
		sqlQuery(db, paste("INSERT INTO actloc (actobj_id, loc_id) VALUES (", paste(actobj_id, levels(dataframe[, i]), 
		sep = ",", collapse = "),("), ")", sep = ""))
	}
	
	#Writing cell
	if (is.numeric(dataframe[,rescol])) {
		if (sum(colnames(dataframe) == "obs") == 1) {
			n <- length(levels(factor(dataframe$obs)))
			if (n > 1) {
				if (n.obs.const) {ncell <- nrow(dataframe)/n} else {
					n <- tapply(dataframe[,rescol], dataframe[,ColIds], length)
					ncell <- sum(n, na.rm = TRUE)
				}
			} else {ncell <- nrow(dataframe)}
		} else {n <- 1; ncell <- nrow(dataframe)}
	} else n <- tapply(dataframe[,rescol], dataframe[,ColIds], length) # for textual data, this is used for dimension finding
	if (is.numeric(dataframe[,rescol])) means <- tapply(dataframe[,rescol], dataframe[,ColIds], mean) else means <- rep(0, ncell)
	if (is.numeric(dataframe[,rescol])) {
		sds <- tapply(dataframe[,rescol], dataframe[,ColIds], sd); sds[] <- ifelse(n == 1, 0, sds)} else sds <- rep(0, ncell)
	
	cellQuery <- paste(actobj_id, means[!is.na(means)], sds[!is.na(sds)], n[!is.na(n)], sep = ",")
	i <- 1
	while (length(cellQuery) >= (i + maxrows - 1)) {
		sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, sd, n) VALUES (', paste(cellQuery[i:(i + maxrows - 1)], 
			collapse = '),('), ')', sep = ''))
		i <- i + maxrows
	}
	if (length(cellQuery) %% maxrows != 0) {
		sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, sd, n) VALUES (', 
			paste(cellQuery[i:length(cellQuery)], collapse = '),('), ')', sep = ''))
	}
	
	#Writing res
	cell_id <- sqlQuery(db, paste("SELECT id FROM cell WHERE actobj_id = ", actobj_id, " ORDER BY ID", sep = ""))[,1]
	if (length(cell_id) != ncell) stop("number of written cells differs from given data")
	if (is.numeric(dataframe[,rescol])) ids <- means else ids <- n
	ids[!is.na(ids)] <- cell_id
	dataframe[, ncol(dataframe) + 1] <- ids[as.matrix(dataframe[,ColIds])]
	colnames(dataframe)[ncol(dataframe)] <- "cell_id"
	
	resQuery <- paste(dataframe[,"cell_id"], ',', if(sum(colnames(dataframe) == "obs") == 0) 1 else dataframe[,"obs"], ',', 
		if (!is.numeric(dataframe[,rescol])) '"', dataframe[,rescol], if (!is.numeric(dataframe[,rescol])) '"', sep = "")
	i <- 1
	while (length(resQuery) >= (i + maxrows - 1)) {
		sqlQuery(db, paste('INSERT INTO res (cell_id, obs, ', ifelse(is.numeric(dataframe[,rescol]), "result", "restext"), ') VALUES (', 
			paste(resQuery[i:(i + maxrows - 1)], collapse = '),('), ')', sep = ''))
		i <- i + maxrows
	}
	if (length(resQuery) %% maxrows != 0) {
		sqlQuery(db, paste('INSERT INTO res (cell_id, obs, ', ifelse(is.numeric(dataframe[,rescol]), "result", "restext"), ') VALUES (', 
			paste(resQuery[i:length(resQuery)], collapse = '),('), ')', sep = ''))
	}
	
	#Writing loccell
	ids <- as.data.frame(as.table(ids))
	ids <- ids[!is.na(ids$Freq),]
	loccellQuery <- paste(ids$Freq, unlist(ids[,-ncol(ids)]), sep = ",")
	i <- 1
	while (length(loccellQuery) >= (i + maxrows - 1)) {
		sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[i:(i + maxrows - 1)], collapse = '),('), ')', 
			sep = ''))
		i <- i + maxrows
	}
	if (length(loccellQuery) %% maxrows != 0) {
		sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[i:length(loccellQuery)], collapse = '),('), ')', 
			sep = ''))
	}
	
	#Close database connection
	odbcClose(db)
	cat("Successful\n")
	return(character())
}

Usage

op_baseWrite(dsn, input, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL, acttype = NULL)
  • dsn and input must be defined, the rest of the object and act parameters if not defined are prompted for by the function as needed.
    • For uploading the DSN defined must have writers permissions.
Restrictions
  • Input may only be given in either array or data.frame form.
    • Indexes used may not exceed the character limit of 20.
      • Indexes should preferably match an earlier entry: Special:OpasnetBaseIndices.
      • Indexes are treated as identifiers for indexes in the database, spaces in the indexes are converted to _. This ensures maximum compatibility and ease in operations in which data is downloaded and uplaoded again. Names and more specific details can be edited into the indexes separately.
    • Input in data.frame form must contain a "Freq", "Result" or "result" column, this is where the numerical value is read.
    • Probabilistic input must contain a dimension (in case of array) or a column (in case of data.frame) named "obs".