Chapter 5 Polynomial and Splines
5.1 Polynomaials
level 1: \[ {Y}_{ij} = \beta_{0j} + \beta_{1j}(Time_{ij} - \bar{X)} + \beta_{2j}(Time_{ij} - \bar{X)}^2 + \varepsilon_{ij} \]
Level 2: \[ {\beta}_{0j} = \gamma_{00} + U_{0j}\]
\[ {\beta}_{1j} = \gamma_{10} + U_{1j} \] \[ {\beta}_{2j} = \gamma_{20} + U_{2j} \]
5.2 polynomial example
rm(list = ls())
library(readr)
cdrs <- read_csv("~/Box Sync/5165 Applied Longitudinal Data Analysis/Longitudinal/cdrs.csv")
## Parsed with column specification:
## cols(
## mapid = col_integer(),
## exclude = col_character(),
## cdr = col_double(),
## testdate = col_integer()
## )
personality <- read_csv("~/Box Sync/5165 Applied Longitudinal Data Analysis/Longitudinal/Subject_personality.csv")
## Parsed with column specification:
## cols(
## mapid = col_integer(),
## age = col_integer(),
## neodate = col_integer(),
## neuroticism = col_integer(),
## extraversion = col_integer(),
## openness = col_integer(),
## agreeablness = col_integer(),
## conscientiousness = col_integer(),
## gender = col_character()
## )
library(ggplot2)
gg1 <- ggplot(personality,
aes(x = neodate, y = neuroticism, group = mapid)) + geom_line()
gg1
## Warning: Removed 1 rows containing missing values (geom_path).
library(tidyverse)
personality<- personality %>%
group_by(mapid) %>%
arrange(neodate) %>%
mutate(wave = seq_len(n()))
gg2 <- ggplot(personality,
aes(x = wave, y = neuroticism, group = mapid)) + geom_line()
gg2
personality$neodate <- as.Date(personality$neodate, origin = "1900-01-01")
## Warning in strptime(xx, f <- "%Y-%m-%d", tz = "GMT"): unknown timezone
## 'zone/tz/2017c.1.0/zoneinfo/America/Chicago'
gg3 <- ggplot(personality,
aes(x = neodate, y = neuroticism, group = mapid)) + geom_line()
gg3
## Warning: Removed 1 rows containing missing values (geom_path).
## convert to days from first assessment
personality.wide <- personality %>%
dplyr::select(mapid, wave, neodate) %>%
spread(wave, neodate)
personality.wide$wave_1 <- personality.wide$'1'
personality.wide$wave_2 <- personality.wide$'2'
personality.wide$wave_3 <- personality.wide$'3'
personality.wide$wave_4 <- personality.wide$'4'
personality.wide$wave_5 <- personality.wide$'5'
personality.wide <- personality.wide %>%
mutate (w_1 = (wave_1 - wave_1)/365,
w_2 = (wave_2 - wave_1)/365,
w_3 = (wave_3 - wave_1)/365,
w_4 = (wave_4 - wave_1)/365,
w_5 = (wave_5 - wave_1)/365)
personality.long <- personality.wide %>%
dplyr::select(mapid, w_1:w_5) %>%
gather(wave, year, -mapid) %>%
separate(wave, c('weeks', 'wave' ), sep="_") %>%
dplyr::select(-weeks)
personality.long$wave <- as.numeric(personality.long$wave)
personality <- personality %>%
left_join(personality.long, by = c('mapid', 'wave' ))
gg4 <- ggplot(personality,
aes(x = year, y = neuroticism, group = mapid)) + geom_line()
gg4
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 1 rows containing missing values (geom_path).
library(lme4)
p1 <- lmer(neuroticism ~ year + (1 | mapid), data=personality)
summary(p1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: neuroticism ~ year + (1 | mapid)
## Data: personality
##
## REML criterion at convergence: 13657.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7877 -0.4675 -0.0227 0.4289 3.3166
##
## Random effects:
## Groups Name Variance Std.Dev.
## mapid (Intercept) 42.16 6.493
## Residual 15.65 3.956
## Number of obs: 2105, groups: mapid, 1090
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 16.05632 0.22577 71.12
## year -0.13204 0.03247 -4.07
##
## Correlation of Fixed Effects:
## (Intr)
## year -0.247
library(lme4)
personality.s <- personality %>%
group_by(mapid) %>%
tally() %>%
filter(n >=2)
personality <- personality %>%
filter(mapid %in% personality.s$mapid)
p2 <- lmer(neuroticism ~ year + (1 | mapid), data=personality)
summary(p2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: neuroticism ~ year + (1 | mapid)
## Data: personality
##
## REML criterion at convergence: 10396.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7542 -0.5122 -0.0282 0.4698 3.3369
##
## Random effects:
## Groups Name Variance Std.Dev.
## mapid (Intercept) 40.92 6.397
## Residual 15.61 3.950
## Number of obs: 1635, groups: mapid, 620
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 15.3797 0.2915 52.76
## year -0.1083 0.0331 -3.27
##
## Correlation of Fixed Effects:
## (Intr)
## year -0.320
p3 <- lmer(neuroticism ~ year + (year | mapid), data=personality)
summary(p3)
## Linear mixed model fit by REML ['lmerMod']
## Formula: neuroticism ~ year + (year | mapid)
## Data: personality
##
## REML criterion at convergence: 10389.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7438 -0.4825 -0.0305 0.4443 3.3453
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## mapid (Intercept) 41.68558 6.4564
## year 0.09824 0.3134 -0.10
## Residual 14.25791 3.7760
## Number of obs: 1635, groups: mapid, 620
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 15.37238 0.29135 52.76
## year -0.10272 0.03602 -2.85
##
## Correlation of Fixed Effects:
## (Intr)
## year -0.317
5.2.1 importance of centering
personality$year <- as.numeric(personality$year)
p4 <- lmer(neuroticism ~ year + I(year^2) + (year | mapid), data=personality)
summary(p4)
## Linear mixed model fit by REML ['lmerMod']
## Formula: neuroticism ~ year + I(year^2) + (year | mapid)
## Data: personality
##
## REML criterion at convergence: 10395.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7663 -0.4836 -0.0251 0.4422 3.3258
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## mapid (Intercept) 41.72924 6.4598
## year 0.09815 0.3133 -0.10
## Residual 14.26217 3.7765
## Number of obs: 1635, groups: mapid, 620
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 15.324317 0.297094 51.58
## year -0.031791 0.092091 -0.35
## I(year^2) -0.008789 0.010490 -0.84
##
## Correlation of Fixed Effects:
## (Intr) year
## year -0.300
## I(year^2) 0.194 -0.920
# woah, how do I interpret this? WHy all of a sudden non-sig?
# what would happen if I changed my time metric?
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:merTools':
##
## ICC
## The following objects are masked from 'package:arm':
##
## logit, rescale, sim
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
describe(personality$year)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1635 3.1 3.29 2.45 2.66 3.63 0 12.78 12.78 0.8 -0.41
## se
## X1 0.08
personality$year.c <- personality$year - 3.1
p5 <- lmer(neuroticism ~ year.c + I(year.c^2) + (year.c | mapid), data=personality)
summary(p5)
## Linear mixed model fit by REML ['lmerMod']
## Formula: neuroticism ~ year.c + I(year.c^2) + (year.c | mapid)
## Data: personality
##
## REML criterion at convergence: 10395.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7663 -0.4836 -0.0251 0.4422 3.3258
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## mapid (Intercept) 41.43047 6.4367
## year.c 0.09815 0.3133 0.05
## Residual 14.26218 3.7765
## Number of obs: 1635, groups: mapid, 620
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 15.141299 0.296073 51.14
## year.c -0.086285 0.041061 -2.10
## I(year.c^2) -0.008789 0.010490 -0.84
##
## Correlation of Fixed Effects:
## (Intr) year.c
## year.c 0.226
## I(year.c^2) -0.353 -0.480
5.2.2 random terms
fitting a random slope plus a random quadratic leads to difficulties ie non-congergence. What does this model say?
p6 <- lmer(neuroticism ~ year + I(year^2) + ( I(year^2) | mapid), data=personality)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
summary(p6)
## Linear mixed model fit by REML ['lmerMod']
## Formula: neuroticism ~ year + I(year^2) + (I(year^2) | mapid)
## Data: personality
##
## REML criterion at convergence: 10398.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7737 -0.4938 -0.0201 0.4526 3.3479
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## mapid (Intercept) 4.078e+01 6.38610
## I(year^2) 4.854e-04 0.02203 0.02
## Residual 1.506e+01 3.88052
## Number of obs: 1635, groups: mapid, 620
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 15.321512 0.296443 51.68
## year -0.026969 0.093406 -0.29
## I(year^2) -0.009489 0.010731 -0.88
##
## Correlation of Fixed Effects:
## (Intr) year
## year -0.300
## I(year^2) 0.202 -0.928
## convergence code: 0
## Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
5.3 Splines aka piecewise
Fit more than 1 trajectory. Best to use when we have a reason for a qualitative difference at some identified time point. For example, before your health event you may have a different trajectory than after it and thus you would want to model two seperate trajectories. Splines allow you to do this in a single model. You can do this in simple regression and the logic follows for growth models.
We simply replace time with dummy variables that represent different segments we wish to model. The point of separation is called a knot. You can have as many as you want and these can be pre-specified (usually for our case) or in more advanced treatments have the data specify it for you.
5.3.1 seperate curves
The most common is to create different trajectories that change across knots. The easiest example is to take your time variable and transform it into a Time1 and time2, that represent the different time periods. This is easiest to see if we choose our wave variable as our time metric, though you do not have to necessarily do it this way.
t1 <- tribble(
~time, ~t0, ~t1,~t2,~t3,~t4,~t5,
"time 1", 0, 1,2,2,2,2,
"time 2", 0, 0,0,1,2,3
)
t1
## # A tibble: 2 x 7
## time t0 t1 t2 t3 t4 t5
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 time 1 0 1 2 2 2 2
## 2 time 2 0 0 0 1 2 3
The idea is that once you hit the knot your value stays the same. Same logic for the second knot, until you get to that knot you dont have a trajectory.
5.3.2 incremental curves
This can be contrasted with a different type of coding, called incremental. Here the first trajectory keeps going, whereas the second trajectory starts at the position of the knot.
t2 <- tribble(
~time, ~t0, ~t1,~t2,~t3,~t4,~t5,
"time 1", 0, 1,2,3,4,5,
"time 2", 0, 0,0,1,2,3
)
t2
## # A tibble: 2 x 7
## time t0 t1 t2 t3 t4 t5
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 time 1 0 1 2 3 4 5
## 2 time 2 0 0 0 1 2 3
The two coding schemes propose the same type of trajectoy, the only thing that differes is the interpretation of the coefficients.
In the first, the two slope coefficients represent the actual slope in the respective time period.
In the second, the coefficient for time 2 represents the deviation from the slope in period 1. The positive of this second method is you can easily test whether these two slopes are different from one another.
level 1:
\[ {Y}_{ij} = \beta_{0j} + \beta_{1j}Time1_{ij} + \beta_{2j}Time2_{ij} + \varepsilon_{ij} \]
Level 2: \[ {\beta}_{0j} = \gamma_{00} + U_{0j} \]
\[ {\beta}_{1j} = \gamma_{10} + U_{1j} \] \[ {\beta}_{2j} = \gamma_{20} + U_{2j} \]
5.3.3 splines example
personality$time1 <- recode(personality$wave, '1' = 0 , '2' = 1, '3' = 1, '4' = 1,'5' = 1)
personality$time2 <- recode(personality$wave, '1' = 0 , '2' = 0, '3' = 1, '4' = 2,'5' = 3)
p7 <- lmer(conscientiousness ~ time1 + time2 + (time1 | mapid) , data=personality)
summary(p7)
## Linear mixed model fit by REML ['lmerMod']
## Formula: conscientiousness ~ time1 + time2 + (time1 | mapid)
## Data: personality
##
## REML criterion at convergence: 10003.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.2557 -0.4068 0.0272 0.4304 4.5853
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## mapid (Intercept) 32.98 5.743
## time1 4.73 2.175 -0.13
## Residual 10.70 3.271
## Number of obs: 1635, groups: mapid, 620
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 34.1871 0.2654 128.80
## time1 -0.5365 0.2018 -2.66
## time2 0.2184 0.1561 1.40
##
## Correlation of Fixed Effects:
## (Intr) time1
## time1 -0.370
## time2 0.000 -0.301
gg5 <- ggplot(personality, aes(x = wave, y = conscientiousness, group = mapid)) + stat_smooth(method = 'lm', formula = y ~ poly(x,2, raw = TRUE),data = personality, aes(x = wave, y = conscientiousness, group=1)) + scale_y_continuous(limits = c(30, 40))
gg5
## Warning: Removed 609 rows containing non-finite values (stat_smooth).
5.4 splines + polynomail = polynomial piecewise
\[ {Y}_{ij} = \beta_{0j} + \beta_{1j}Time1_{ij} + \beta_{2j}Time1_{ij}^2 + \beta_{3j}Time2_{ij} + \varepsilon_{ij} \]
Level 2: \[ {\beta}_{0j} = \gamma_{00} + U_{0j} \]
\[ {\beta}_{1j} = \gamma_{10} + U_{1j} \] \[ {\beta}_{2j} = \gamma_{20} + U_{2j} \] \[ {\beta}_{3j} = \gamma_{30} + U_{3j}\]