Difference between revisions of "Input.interp"
(→Answer) |
(→Rcode) |
||
Line 55: | Line 55: | ||
<rcode name="answer"> | <rcode name="answer"> | ||
+ | library(triangle) | ||
+ | |||
# Lognormal distribution parametrization functions | # Lognormal distribution parametrization functions | ||
lmean <- function(parmean, parsd) {return(log(parmean)-log(1+(parsd^2)/(parmean^2))/2)} | lmean <- function(parmean, parsd) {return(log(parmean)-log(1+(parsd^2)/(parmean^2))/2)} | ||
Line 62: | Line 64: | ||
interpf <- function( | interpf <- function( | ||
n, | n, | ||
− | |||
res.char, | res.char, | ||
brackets.pos, | brackets.pos, | ||
brackets.length, | brackets.length, | ||
− | minus, minus.length, | + | minus, |
+ | minus.length, | ||
minus.exists, | minus.exists, | ||
plusminus, | plusminus, | ||
plusminus.length, | plusminus.length, | ||
− | plusminus.exists | + | plusminus.exists, |
+ | doublePoint | ||
) { | ) { | ||
− | if(brackets.pos | + | if(doublePoint[1] > 0) { |
− | minus.relevant <- unlist(minus)[(cumsum(c(0, minus.length)) + 1) | + | tempArgs <- sort(as.numeric(unlist(strsplit(res.char, "\\:")))) |
− | n.minus.inside.brackets <- sum(minus.relevant > brackets.pos | + | return(rtriangle(n,tempArgs[1],tempArgs[3],tempArgs[2])) |
− | imean <- as.numeric(substr(res.char | + | } |
+ | if(brackets.pos >= 0) { | ||
+ | minus.relevant <- unlist(minus)[(cumsum(c(0, minus.length)) + 1):cumsum(minus.length)] | ||
+ | n.minus.inside.brackets <- sum(minus.relevant > brackets.pos & minus.relevant < brackets.pos + brackets.length) | ||
+ | imean <- as.numeric(substr(res.char, 1, brackets.pos - 1)) | ||
if(n.minus.inside.brackets == 1) { | if(n.minus.inside.brackets == 1) { | ||
− | ici <- c(as.numeric(substr(res.char | + | ici <- c(as.numeric(substr(res.char, brackets.pos + 1, minus.relevant[minus.relevant > brackets.pos] - 1)), as.numeric(substr(res.char, |
− | minus.relevant[minus.relevant > brackets.pos | + | minus.relevant[minus.relevant > brackets.pos] + 1, brackets.pos + brackets.length - 2))) |
isd <- sum(abs(ici - imean) / 2) / qnorm(0.975) | isd <- sum(abs(ici - imean) / 2) / qnorm(0.975) | ||
if((ici[2] - imean) / (ici[1] - imean) < 1.5) { | if((ici[2] - imean) / (ici[1] - imean) < 1.5) { | ||
Line 87: | Line 94: | ||
} else | } else | ||
if(n.minus.inside.brackets %in% c(2,3)) { | if(n.minus.inside.brackets %in% c(2,3)) { | ||
− | + | ici <- c(as.numeric(substr(res.char, brackets.pos + 1, minus.relevant[minus.relevant > brackets.pos][2] - 1)), as.numeric(substr(res.char, | |
− | ici <- c(as.numeric(substr(res.char | + | minus.relevant[minus.relevant > brackets.pos][2] + 1, brackets.pos + brackets.length - 2))) |
− | minus.relevant[minus.relevant > brackets.pos | ||
isd <- sum(abs(ici - imean) / 2) / qnorm(0.975) | isd <- sum(abs(ici - imean) / 2) / qnorm(0.975) | ||
return(rnorm(n, imean, isd)) | return(rnorm(n, imean, isd)) | ||
} | } | ||
− | warning(paste("Unable to interpret \"", res.char | + | warning(paste("Unable to interpret \"", res.char, "\"", sep = "")) |
return(NA) | return(NA) | ||
} | } | ||
− | if(minus.exists | + | if(minus.exists) { |
− | minus.relevant <- unlist(minus)[(cumsum(c(0, minus.length)) + 1) | + | minus.relevant <- unlist(minus)[(cumsum(c(0, minus.length)) + 1):cumsum(minus.length)] |
if(length(minus.relevant) == 1) { | if(length(minus.relevant) == 1) { | ||
− | if(as.numeric(substr(res.char | + | if(as.numeric(substr(res.char, 1, minus.relevant - 1)) / as.numeric(substr(res.char, minus.relevant + 1, nchar(res.char))) >= 1/100) { |
− | return(runif(n, as.numeric(substr(res.char | + | return(runif(n, as.numeric(substr(res.char, 1, minus.relevant - 1)), as.numeric(substr(res.char, minus.relevant + 1, nchar(res.char[i]))))) |
} else { | } else { | ||
− | return(exp(runif(n, log(as.numeric(substr(res.char | + | return(exp(runif(n, log(as.numeric(substr(res.char, 1, minus.relevant - 1))), log(as.numeric(substr(res.char, minus.relevant + 1, nchar(res.char))))))) |
} | } | ||
} | } | ||
if(length(minus.relevant) %in% c(2,3)) { | if(length(minus.relevant) %in% c(2,3)) { | ||
− | return(runif(n, as.numeric(substr(res.char | + | return(runif(n, as.numeric(substr(res.char, 1, minus.relevant[2] - 1)), as.numeric(substr(res.char, minus.relevant[2] + 1, nchar(res.char))))) |
} | } | ||
} | } | ||
− | if(plusminus.exists | + | if(plusminus.exists) { |
− | return(rnorm(n, as.numeric(substr(res.char | + | return(rnorm(n, as.numeric(substr(res.char, 1, plusminus[1] - 1)), as.numeric(substr(res.char, plusminus[1] + 1, nchar(res.char))))) |
} | } | ||
− | if(sum(unlist(strsplit(res.char | + | if(sum(unlist(strsplit(res.char, ""))==";") > 0) { |
− | return(sample(sapply(strsplit(res.char | + | return(sample(sapply(strsplit(res.char, ";"), as.numeric), N, replace = TRUE)) |
} | } | ||
− | warning(paste("Unable to interpret \"", res.char | + | warning(paste("Unable to interpret \"", res.char, "\"", sep = "")) |
return(NA) | return(NA) | ||
} | } | ||
Line 132: | Line 138: | ||
brackets.length <- as.numeric(unlist(sapply(brackets, attributes)[1,])) | brackets.length <- as.numeric(unlist(sapply(brackets, attributes)[1,])) | ||
brackets.pos <- unlist(brackets) | brackets.pos <- unlist(brackets) | ||
+ | doublePoint <- gregexpr(":", res.char) | ||
out <- list() | out <- list() | ||
for(i in 1:length(res.char)) { | for(i in 1:length(res.char)) { | ||
− | out[[i]] <- interpf(n | + | out[[i]] <- interpf(n, res.char[i], brackets.pos[i], brackets.length[i], minus[i], minus.length[i], minus.exists[i], plusminus[[i]], |
− | plusminus.length, plusminus.exists) | + | plusminus.length[i], plusminus.exists[i],doublePoint[[i]]) |
} | } | ||
out | out | ||
Line 152: | Line 159: | ||
out$Interp.Result <- unlist(temp) | out$Interp.Result <- unlist(temp) | ||
dim(temp.lengths) <- length(temp.lengths) | dim(temp.lengths) <- length(temp.lengths) | ||
− | out$ | + | out$Iter<- c(apply(temp.lengths, 1, iter.f)) |
out | out | ||
} | } | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
===Independent or not?=== | ===Independent or not?=== |
Revision as of 10:48, 14 June 2012
This page is a method.
The page identifier is Op_en5364 |
---|
Moderator:Jouni (see all) |
This page is a stub. You may improve it into a full page, and then a rating bar will appear here. |
Upload data
|
input.interp is an R function that interprets model inputs from a user-friendly format into explicit and exact mathematical format. The purpose is to make it easy for a user to give input without a need to worry about technical modelling details.
Question
What should be a list of important user input formats, and how should they be interpreted?
Answer
The basic feature is that if a text string can be converted to a meaningful numeric object, it will be. This function can be used when data is downloaded from Opasnet Base: if Result.Text contains this kind of numeric information, it is converted to numbers and fused with Result.
n is the number of iterations in the model. # is any numeric character in the text string.
Example | Regular expression | Interpretation | Output in R |
---|---|---|---|
12 000 | # # | 12000. Text is interpreted as number if space removal makes it a number. | as.numeric(gsub(" ", "", Result.text)) |
-14,23 | -# | -14.23. Minus in the beginning of entry is interpreted as minus, not a sign for a range. | ⇤# : Not needed. See above. --Jouni 15:30, 16 April 2012 (EEST) |
50 - 125 | # - # | Uniform distribution between 50 and 125 | data.frame(iter=1:n, result=runif(n,50,125)) |
-12 345 - -23.56 | -# - -# | Uniform distribution between -12345 and -23.56. | |
1 - 50 | # - # | Loguniform distribution between 1 and 50 (Lognormality is assumed if the ratio of upper to lower is > 100) | |
3.1 ± 1.2 or 3.1 +- 1.2 | # ± # or # +- # | Normal distribution with mean 3.1 and SD 1.2 | data.frame(iter=1:n, result=rnorm(n,3.1,1.2)) |
2.4 (1.8 - 3.0) | # (# - #) | Normal distribution with mean 2.4 and 95 % confidence interval from 1.8 to 3.0 | data.frame(iter=1:n, result=rnorm(n,2.4,(3.0-1.8)/2/1.96)) |
2.4 (2.0 - 3.2) | # (# - #) | Lognormal distribution with mean 2.4 and 95 % confidence interval from 2.0 to 3.0. Lognormality is assumed if the difference from mean to upper limit is => 50 % greater than from mean to lower limit. | |
2;4;7 | Each entry (2, 4, and 7 in this case) are equally likely to occur. Entries can also be text. | ||
(0,0.5,1) | (#,#,#) | Triangular distribution | |
* (in index, or explanatory, columns) | The result applies to all locations of this index. | With merge() function, this column is not used as a criterion when these rows are merged. |
How to actually make this happen in R?
- Make a temporary result temp by removing all spaces from Result.Text. Columns: Indices,Result.Result.Text,temp (Indices contains all explanatory columns.)
- Replace all "," with "."
- Check if there are parentheses "()". If yes, assume that they contain 95 % CI.
- Check if there are ranges "#-#".
- Divide the rows of the data.frame into two new data.frames with the same list of columns (Indices,Result).
- If temp is a syntactically correct distribution, take the row to data.frame A and replace Result with temp.
- Otherwise, take the row to data.frame B and replace Result with Result.Text if that is not NA.
- Create a new data.frame with index Iter = 1:n.
- Make a random sample from each probability distribution in data.frame A using Iter.
- Merge the data.frame B with Iter.
- Join data.frames A and B with rbind(). Columns: Iter,Index,Result.
Rcode
<rcode name="answer"> library(triangle)
- Lognormal distribution parametrization functions
lmean <- function(parmean, parsd) {return(log(parmean)-log(1+(parsd^2)/(parmean^2))/2)} lsd <- function(parmean, parsd) {return(log(1+(parsd^2)/(parmean^2)))}
- Actual interpretation function. Takes already pre-processed information and returns a distribution.
interpf <- function( n, res.char, brackets.pos, brackets.length, minus, minus.length, minus.exists, plusminus, plusminus.length, plusminus.exists, doublePoint ) { if(doublePoint[1] > 0) { tempArgs <- sort(as.numeric(unlist(strsplit(res.char, "\\:")))) return(rtriangle(n,tempArgs[1],tempArgs[3],tempArgs[2])) } if(brackets.pos >= 0) { minus.relevant <- unlist(minus)[(cumsum(c(0, minus.length)) + 1):cumsum(minus.length)] n.minus.inside.brackets <- sum(minus.relevant > brackets.pos & minus.relevant < brackets.pos + brackets.length) imean <- as.numeric(substr(res.char, 1, brackets.pos - 1)) if(n.minus.inside.brackets == 1) { ici <- c(as.numeric(substr(res.char, brackets.pos + 1, minus.relevant[minus.relevant > brackets.pos] - 1)), as.numeric(substr(res.char, minus.relevant[minus.relevant > brackets.pos] + 1, brackets.pos + brackets.length - 2))) isd <- sum(abs(ici - imean) / 2) / qnorm(0.975) if((ici[2] - imean) / (ici[1] - imean) < 1.5) { return(rnorm(n, imean, isd)) } else { return(outi <- rlnorm(n, lmean(imean, isd), lsd(imean, isd))) # menee vaarin koska isd on laskettu normaalijakaumalle } } else if(n.minus.inside.brackets %in% c(2,3)) { ici <- c(as.numeric(substr(res.char, brackets.pos + 1, minus.relevant[minus.relevant > brackets.pos][2] - 1)), as.numeric(substr(res.char, minus.relevant[minus.relevant > brackets.pos][2] + 1, brackets.pos + brackets.length - 2))) isd <- sum(abs(ici - imean) / 2) / qnorm(0.975) return(rnorm(n, imean, isd)) } warning(paste("Unable to interpret \"", res.char, "\"", sep = "")) return(NA) } if(minus.exists) { minus.relevant <- unlist(minus)[(cumsum(c(0, minus.length)) + 1):cumsum(minus.length)] if(length(minus.relevant) == 1) { if(as.numeric(substr(res.char, 1, minus.relevant - 1)) / as.numeric(substr(res.char, minus.relevant + 1, nchar(res.char))) >= 1/100) { return(runif(n, as.numeric(substr(res.char, 1, minus.relevant - 1)), as.numeric(substr(res.char, minus.relevant + 1, nchar(res.char[i]))))) } else { return(exp(runif(n, log(as.numeric(substr(res.char, 1, minus.relevant - 1))), log(as.numeric(substr(res.char, minus.relevant + 1, nchar(res.char))))))) } } if(length(minus.relevant) %in% c(2,3)) { return(runif(n, as.numeric(substr(res.char, 1, minus.relevant[2] - 1)), as.numeric(substr(res.char, minus.relevant[2] + 1, nchar(res.char))))) } } if(plusminus.exists) { return(rnorm(n, as.numeric(substr(res.char, 1, plusminus[1] - 1)), as.numeric(substr(res.char, plusminus[1] + 1, nchar(res.char))))) } if(sum(unlist(strsplit(res.char, ""))==";") > 0) { return(sample(sapply(strsplit(res.char, ";"), as.numeric), N, replace = TRUE)) } warning(paste("Unable to interpret \"", res.char, "\"", sep = "")) return(NA) }
- The next function processes character strings and loops the interpretation function.
input.interp <- function(res.char, n = 1000) { res.char <- gsub(" ", "", res.char) res.char <- gsub(",", ".", res.char) plusminus <- gregexpr(paste("\\+-|", rawToChar(as.raw(177)), sep = ""), res.char) # saattaa osoittautua ongelmaksi enkoodauksen vuoksi plusminus.length <- sapply(plusminus, length) plusminus.exists <- unlist(plusminus)[cumsum(c(0, plusminus.length[-length(plusminus.length)])) + 1] > 0 minus <- gregexpr("-", res.char) minus.length <- sapply(minus, length) minus.exists <- unlist(minus)[cumsum(c(0, minus.length[-length(minus.length)])) + 1] > 0 brackets <- gregexpr("\\(.*\\)", res.char) # matches for brackets "(...)" brackets.length <- as.numeric(unlist(sapply(brackets, attributes)[1,])) brackets.pos <- unlist(brackets) doublePoint <- gregexpr(":", res.char) out <- list() for(i in 1:length(res.char)) { outi <- interpf(n, res.char[i], brackets.pos[i], brackets.length[i], minus[i], minus.length[i], minus.exists[i], plusminusi, plusminus.length[i], plusminus.exists[i],doublePointi) } out }
- Assisting function for data.frame wrapper.
iter.f <- function(x) { 1:x }
- Data.frame wrapper for the functions.
input.interp.df <- function(idata, rescol = "Result.text", N = 1000) { temp <- input.interp(idata[, rescol], N) temp.lengths <- sapply(temp, length) out <- idata[rep(1:nrow(idata), times = temp.lengths),] out$Interp.Result <- unlist(temp) dim(temp.lengths) <- length(temp.lengths) out$Iter<- c(apply(temp.lengths, 1, iter.f)) out }
Independent or not?
Specific character string can be converted to distributions. There are two ways to do this, with parameter independent = TRUE in the first case, and FALSE in the second one:
- The character strings are interpreted one row at a time, and each row is made an independent distribution.
- The character strings are treated as a factor. The levels of the factor are converted to distributions. Therefore, all rows that have the same character string will have the an identical (not independent) distribution.