+ Show code- Hide code
# This is code Op_en7749/bayes on page [[Goherr: Fish consumption study]]
library(OpasnetUtils)
library(ggplot2)
library(reshape2)
library(rjags)
library(car)
# Fish intake in humans
# Data from data.frame survey from page [[Goherr: Fish consumption study]]
# Start with salmon questions 46:49 (amounts eaten)
# Some preprocessing should be moved away from this code.
# Model assumes that questionnaire data follows binomial distribution with parameter p and max value levels.
# p depends on country (4 groups), sex (2), age (2) that are known individually.
# We should test whether sex makes a difference or whether it can be omitted.
# Predicted p for each question, determined by the group, are produced from the bayes model.
# p is used to sample answers, which are then combined with estimates of amounts related to each answer.
# Total amount eaten of each fish per modelled individual is finally calculated.
objects.latest("Op_en7749", "preprocess") # [[Goherr: Fish consumption study]]: survey, surcol
agel <- as.character(unique(survey$Ages))
countryl <- sort(as.character(unique(survey$Country)))
genderl <- sort(as.character(unique(survey$Gender)))
fisl <- c("Salmon", "Herring")
# Interesting fish eating questions
surv <- survey[c(1,3,158,16,29,30,31,46:49,86,95:98)]
colnames(surv)
#[1] "Country" "Gender"
#[3] "Ages" "Fish eating"
#[5] "How often eat fish" "Salmon eating"
#[7] "Baltic salmon" "How often Baltic salmon"
#[9] "How much Baltic salmon" "How often side Baltic salmon"
#[11] "How much side Baltic salmon" "Eat Baltic herring"
#[13] "How often Baltic herring" "How much Baltic herring"
#[15] "How often side Baltic herring" "How much side Baltic herring"
surv$Ages <- match(surv$Ages, agel) # Not a factor, coerce to integer
surv <- as.data.frame(lapply(surv, FUN = function(x) as.integer(x))) # Coerce to integers
surv[is.na(surv[[12]]) | surv[[12]] == 3 , 12] <- 1 # Eat Baltic herring: I don't know --> No
surv[4:16] <- lapply(surv[4:16], FUN = function(x) x-1) # Scale to start from zero
# Row numbers for respondents that have eaten fish, Baltic salmon, and Baltic herring
eatfish <- (1:nrow(surv))[surv[[4]] %in% 1]
eatsalm <- (1:nrow(surv))[!(surv[[7]] %in% 0 | is.na(surv[[7]]))]
eatherr <- (1:nrow(surv))[surv[[12]] %in% 1 & !is.na(rowSums(surv[13:16]))]
# Oletetaan, että covarianssimatriisi on vakio kaikille maille ja sukupuolille yms
# mutta keskiarvo on spesifi näille ja kysymykselle.
qlen <- c(4,2,2,2,6,2,2,7,7,7,5,2,7,7,7,5) # Number of options in each question
agel
countryl
genderl
fisl
#################################################
cat("Version with both binomial and multivariate normal.\n")
ggplot(data.frame(A=1:2, B=1:2), aes(A, y=B))+geom_point()+
labs(title="Version with both binomial and multivariate normal.")
mod <- textConnection("
model{
for(i in 1:S) {
for(j in 1:4) {
surv[i,j] ~ dbin(p[j], qlen[j])
}
surv.scaled[i,1:4] ~ dmnorm(mu0[], Omega[,])
}
for(j in 1:4) {
p[j] ~ dbeta(1,1)
}
# mu[1:4] ~ dmnorm(mu0[1:4], S2[1:4,1:4])
Omega[1:4,1:4] ~ dwish(S3[1:4,1:4],4)
}
")
jags <- jags.model(
mod,
data = list(
surv = surv[eatherr,13:16],
surv.scaled = scale(surv[eatherr,13:16]),
qlen = qlen[13:16],
S = length(eatherr),
mu0 = rep(0,4),
# S2 = diag(4)/100000,
S3 = diag(4)/10000
),
n.chains = 4,
n.adapt = 100
)
update(jags, 100)
samps.j <- jags.samples(jags, c('Omega', 'p'), 1000)
j <- array(
samps.j$p,
dim = c(4,1000,4),
dimnames = list(Question = 1:4, Iter = 1:1000, Seed = 1:4)
)
scatterplotMatrix(t(j[,,1]))
j.melt <- melt(j)
ggplot(j.melt, aes(x = Iter, y = value, colour = as.character(Seed)))+
geom_line()+facet_wrap(~ Question)
j.Omega <- array(
samps.j$Omega,
dim = c(4,4,1000,4),
dimnames = list(Q1 = 1:4, Q2 = 1:4, Iter = 1:1000, Seed = 1:4)
)
scatterplotMatrix(t(j.Omega[1,,,1]))
j.Omega.melt <- melt(j.Omega)
ggplot(j.Omega.melt, aes(x = Iter, y = value, colour = as.character(Seed)))+
geom_line()+facet_grid(Q1 ~ Q2, scales = "free_y")
samps.c <- coda.samples(jags, c('Omega','p'), 1000)
plot(samps.c)
############################################################
cat("Version with multivariate normal.\n")
ggplot(data.frame(A=1:2, B=1:2), aes(A, y=B))+geom_point()+
labs(title="Version with multivariate normal.")
mod <- textConnection("
model{
for(i in 1:S) {
surv[i,1:4] ~ dmnorm(mu[], Omega[,])
}
mu[1:4] ~ dmnorm(mu0[1:4], S2[1:4,1:4])
Omega[1:4,1:4] ~ dwish(S3[1:4,1:4],4)
}
")
jags <- jags.model(
mod,
data = list(
surv = surv[eatherr,13:16],
S = length(eatherr),
mu0 = rep(0.5,4),
S2 = diag(4)/100000,
S3 = diag(4)/10000
),
n.chains = 4,
n.adapt = 100
)
update(jags, 100)
samps.j <- jags.samples(jags, c('mu','Omega', 'z'), 1000)
j <- array(
samps.j$mu,
dim = c(4,1000,4),
dimnames = list(Question = 1:4, Iter = 1:1000, Seed = 1:4)
)
scatterplotMatrix(t(j[,,1]))
samps.c <- coda.samples(jags, c('mu','Omega','z'), 1000)
plot(samps.c)
###########################################################
cat("Version with multivariate normal and fractile conversion.\n")
ggplot(data.frame(A=1:2, B=1:2), aes(A, y=B))+geom_point()+
labs(title="Version with multivariate normal and fractile conversion.")
# Assume linearity in the responses, but it is artificially binned into survey answers.
# Assume that answers are uncorrelated with a subgroup of respondents that have
# answered in the same way to two questions. So, for each answer column d, you can
#1 shuffle the vector d (keeping in mind the original order)
#2 sort it based on the answer
#3 renumber it by (1:length(d) - 0.5)/length(d) to get a fractile from 0..1.
#4 transform it to normal distribution by qnorm(d)
#5 reorder d to the original order
# Then, use multivariate normal distribution to estimate parameters.
fractalise <- function(d) {
f <- sample(seq_along(d)) # Create a random order for data
g <- order(d[f]) # First randomorder and then put in order by size
h <- qnorm((seq_along(d)-0.5)/length(d)) # 'z scores' of data points
return(h[order(g)][order(f)]) # Reorder fractiles to original order
}
surv.f <- surv[1:nrow(surv) %in% eatherr & !is.na(rowSums(surv[13:16])) , ]
surv.f[13:16] <- lapply(surv.f[13:16], fractalise)
mod <- textConnection("
model{
for(i in 1:S) {
surv.f[i,1:4] ~ dmnorm(mu[], Omega[,])
}
for(j in 1:4) {
mu[j] ~ dnorm(0, 3)
}
Omega[1:4,1:4] ~ dwish(S3[1:4,1:4],S)
surv.pred[1:4] ~ dmnorm(mu[1:4], Omega[1:4,1:4])
Opri[1:4,1:4] ~ dwish(S3[1:4,1:4],S)
}
")
jags <- jags.model(
mod,
data = list(
surv.f = surv.f,
S = nrow(surv.f),
# mu0 = rep(0.5,4), # hyperprior for mean
# S2 = diag(4)/100000, # hyperprior for mean
S3 = diag(4)*5#/10000 # hyperprior for covariance table
),
n.chains = 4,
n.adapt = 100
)
update(jags, 100)
samps.j <- jags.samples(jags, c('mu','Omega', 'surv.pred', 'Opri'), 1000)
j <- array(
samps.j$surv.pred,
dim = c(4,1000,4),
dimnames = list(Question = 1:4, Iter = 1:1000, Seed = 1:4)
)
oprint(cor(t(samps.j$surv.pred[,,1])))
oprint(cor(surv.f[13:16]))
scatterplotMatrix(t(j[,,1]))
scatterplotMatrix(surv.f[13:16])
j2 <- array(
samps.j$Opri,
dim = c(4,4,1000,4),
dimnames = list(Q1 = 1:4, Q2 = 1:4, Iter = 1:1000, Seed = 1:4)
)
j2.cov <- apply(j2[,,,1], MARGIN = 1:2, FUN = mean)
oprint(j2.cov)
scatterplotMatrix(t(j2[1,,,1]))
samps.c <- coda.samples(jags, c('mu','Omega', 'surv.pred'), 1000)
plot(samps.c)
################################################################3
cat("MODEL WITH BINOMIAL DISTRIBUTION ASSUMPTION.\n")
ggplot(data.frame(A=1:2, B=1:2), aes(A, y=B))+geom_point()+
labs(title="Version with binomial distribution assumption.")
# Interesting fish eating questions
surv <- survey[c(1,3,4,16,29,30,31,46:49,86,95:98,158)]
colnames(surv)
consume <- data.matrix(survey[c(
"Baltic salmon",
"Eat Baltic herring"
)])
consume <- 1 - (consume!=2 | is.na(consume)) # Yes=1, other=0
agel <- as.character(unique(survey$Ages))
countryl <- sort(as.character(unique(survey$Country)))
genderl <- sort(as.character(unique(survey$Gender)))
fisl <- c("Salmon", "Herring")
ans.sal <- survey[
grep("Yes", survey[["Baltic salmon"]]) ,
c(
"How often Baltic salmon",
"How much Baltic salmon",
"How often side Baltic salmon",
"How much side Baltic salmon",
"Ages",
"Country",
"Gender"
)]
ans.her <- survey[
grep("Yes", survey[["Eat Baltic herring"]]) ,
c(
"How often Baltic herring",
"How much Baltic herring",
"How often side Baltic herring",
"How much side Baltic herring",
"Ages",
"Country",
"Gender"
)]
quesl <- c(
"How often",
"How much",
"How often side",
"How much side",
"Ages",
"Country",
"Gender"
)
colnames(ans.sal) <- quesl
colnames(ans.her) <- quesl
ans <- rbind(ans.sal, ans.her)
head(ans)
qlen <- c(7,7,7,5) # Number of options in each question
questionl <- colnames(ans.sal)
questionl.her <- colnames(ans.her)
agel
countryl
genderl
fisl
mod <- textConnection("
model{
# Data1: Full questionnaire
for(f in 1:2) { # Salmon or herring
for(i in 1:U) { # Full questionnaire
consume[i,f] ~ dbern(p[f,ageU[i],countryU[i],genderU[i]])
}
}
# Data2: Those responses that reported either Baltic herring or salmon
for(q in 1:Q) { # Q = # of fish-specific questions
for(i in 1:S) { # S = # respondents who eat Baltic salmon or herring
ans[i,q] ~ dbin(p2[fis[i],age[i],country[i],gender[i],q], qlen[q])
}
}
# Priors (multidimensional)
for(i in 1:A) { # Age groups
for(j in 1:C) { # Countries
for(k in 1:G) { #Genders
for(f in 1:2) { # Fish species
p[f,i,j,k] ~ dbeta(1,1)
for(q in 1:Q) { # Q = # of fish-specific questions
p2[f,i,j,k,q] ~ dbeta(1,1)
ans.pred[f,i,j,k,q] ~ dbin(p2[f,i,j,k,q], qlen[q])
}
}
}
}
}
}
")
jags <- jags.model(
mod,
data = list(
U = nrow(surv),
Q = 4,
S = nrow(ans),
consume = consume,
ans = data.matrix(ans[1:4]) - 1, # Zero is the first option
qlen = qlen-1, # Subtract 1 because starts from 0.
age = match(ans$Ages, agel),
country = match(ans$Country, countryl),
gender = match(ans$Gender, genderl),
ageU = match(survey$Ages, agel),
countryU = match(survey$Country, countryl),
genderU = match(survey$Gender, genderl),
fis = c(rep(1, nrow(ans.sal)), rep(2, nrow(ans.her))),
A = length(agel),
C = length(countryl),
G = length(genderl)
),
n.chains = 4,
n.adapt = 100
)
update(jags, 100)
samps <- jags.samples(jags, c('ans.pred','p','p2'), 1000)
#samps.coda <- coda.samples(jags, c('mu', 'pcd.pred', 'ans.pred'), 1000)
ans.pred <- array(
samps$ans.pred,
dim = c(length(fisl), length(agel), length(countryl), length(genderl), length(quesl), 1000, 4),
dimnames = list(
Fish = fisl,
Age = agel,
Country = countryl,
Gender = genderl,
Question = quesl,
Iter = 1:1000,
Seed = c("S1","S2","S3","S4")
)
)
p.pred <- array(
samps$p,
dim = c(length(fisl), length(agel), length(countryl), length(genderl), 1000, 4),
dimnames = list(
Fish = fisl,
Age = agel,
Country = countryl,
Gender = genderl,
Iter = 1:1000,
Seed = c("S1","S2","S3","S4")
)
)
p2.pred <- array(
samps$p2,
dim = c(length(fisl), length(agel), length(countryl), length(genderl), length(quesl), 1000, 4),
dimnames = list(
Fish = fisl,
Age = agel,
Country = countryl,
Gender = genderl,
Question = quesl,
Iter = 1:1000,
Seed = c("S1","S2","S3","S4")
)
)
ansm <- melt(ans.pred)
pm <- melt(p.pred)
p2m <- melt(p2.pred)
scatterplotMatrix(t(ans.pred[1,1,,1,1,,1]))
scatterplotMatrix(t(p2.pred[1,1,,1,1,,1]))
scatterplotMatrix(t(p2.pred[,1,3,1,1,,1]))
scatterplotMatrix(t(p2.pred[1,,3,1,1,,1]))
scatterplotMatrix(t(p2.pred[1,1,3,,1,,1]))
scatterplotMatrix(t(p2.pred[1,1,3,1,,,1]))
scatterplotMatrix(t(p.pred[1,1,,1,,1]))
scatterplotMatrix(t(p.pred[,1,3,1,,1]))
scatterplotMatrix(t(p.pred[1,,3,1,,1]))
scatterplotMatrix(t(p.pred[1,1,3,,,1]))
#objects.store(ans.pred, p.pred, p2.pred)
#cat("Arrays ans.pred (answer), p.pred (probability to eat), p2.pred (p of answer) stored.\n")
| |