Working with Vowels part 1:
Cleaning and formatting vowel data
Matthew Winn
University of Washington
1 - Download the vowel measurement database
2 - Tidy it up
3 - Add IPA symbols
4 - Format it in the styles that will facilitate easy plotting
5 - Summarize it to obtain easy comparisons
6 - Save objects to use in the next script (for plotting)
library("ggplot2")
library("reshape2")
library("dplyr")
library("devtools")
Optional: remove all objects in the workspace
this is usually a good idea simply to ensure that your code is reproducible from top to bottom.
rm(list=ls())
Declare the destination for any objects that are produced (Fill in your own path here)
folder_to_save_objects <- "C:\\Users\\Matt\\Documents\\Hillenbrand"
If you have already downloaded the table of vowel measurements, guide to that directory and retrieve it
vowel_data_path <- "C:\\Users\\Matt\\Documents\\Hillenbrand\\Database"
setwd(vowel_data_path)
rawdata <- read.table("Hillenbrand_vowels.txt", header=FALSE)
If you don’t already have the table, download it from the internet here:
load_online_vowel_database <- function(){
online_vowel_database <- "http://homepages.wmich.edu/~hillenbr/voweldata/bigdata.dat"
temp <- readLines(online_vowel_database) %>%
data.frame
# omit multiple blank spaces
temp <- temp[-c(1:43),] %>% data.frame(value=.)
temp$value <- gsub(pattern = " ", replacement = " ", temp$value)
temp$value <- gsub(pattern = " ", replacement = " ", temp$value)
temp$value <- gsub(pattern = " ", replacement = " ", temp$value)
column_names <- c("file","Duration","f0ss","f1ss","f2ss","f3ss",
"f110","f210","f310",
"f120","f220","f320",
"f130","f230","f330",
"f140","f240","f340",
"f150","f250","f350",
"f160","f260","f360",
"f170","f270","f370",
"f180","f280","f380")
temp2 <- colsplit(temp$value, pattern = " ", names = column_names)
return(temp2)
}
rawdata <- load_online_vowel_database()
The data doesn’t come with column names, so let’s create them here (redundant if you already downloaded a cleaned copy)
colnames(rawdata) <- c("file","Duration",
"f0ss","f1ss","f2ss","f3ss",
"f110","f210","f310",
"f120","f220","f320",
"f130","f230","f330",
"f140","f240","f340",
"f150","f250","f350",
"f160","f260","f360",
"f170","f270","f370",
"f180","f280","f380")
Hillenbrand et al. set all missing measurments as 0. This will be a problem because they will spuriously lower group averages. So let’s replace all zeros with NA
s, which are handled more gracefully
rawdata[rawdata == 0] <- NA
# Count the total number of NA values
sum(is.na(rawdata))
## [1] 472
# Remove known mis-tracked file
rawdata <- rawdata[rawdata$file != "w40uh",]
Here’s an example of data in “wide” form:
## Time Group_A Group_B Group_C
## 1 1 10 40 70
## 2 2 20 50 80
## 3 3 30 60 90
and here’s the same data in “long” form:
## Time Group value
## 1 1 A 10
## 2 2 A 20
## 3 3 A 30
## 4 1 B 40
## 5 2 B 50
## 6 3 B 60
## 7 1 C 70
## 8 2 C 80
## 9 3 C 90
This is calt melt
ing the data.
Here’s the code to make it happen. In addition to identifying measurements by the sound file, we’re also using Duration as ID because it is a constant for each file.
data.melt <- melt(rawdata, id.vars = c("file","Duration"))
data.melt$Gender <- ifelse(substr(rawdata$file,1,1) %in% c("m","b"), "Male", "Female")
data.melt$Age <- ifelse(substr(rawdata$file,1,1) %in% c("m","w"), "Adult", "Child")
# Now that we have those groups, we can omit the w,m,g,b labels
# Get talker ID number
data.melt$Talker <- substr(data.melt$file,2,3)
# Get two-letter vowel ID code
# Set type as factor so it can be ordered later
data.melt$Vowel <- substr(data.melt$file,4,5) %>% as.factor
# First, see the automatic ordering
levels(data.melt$Vowel)
## [1] "ae" "ah" "aw" "eh" "ei" "er" "ih" "iy" "oa" "oo" "uh" "uw"
# For that order, here are the corresponding Unicode symbols
IPA_vowels <- c("\u00E6", "\u0251", "\u0254", "\u025B", "e\u026A","\u025D", "\u026A", "i", "o\u028A", "\u028A", "\u028C", "u")
# Take a look at the output of the Unicode
IPA_vowels
## [1] "æ" "<U+0251>" "<U+0254>" "<U+025B>" "e<U+026A>" "<U+025D>" "<U+026A>" "i" "o<U+028A>" "<U+028A>" "<U+028C>" "u"
# Translation of code:
# Index `[]` the list of vowel IPA symbols
# using the vector of values in the Vowel column
# as if the Vowels were treated as their corresponding numeric indices
# in the automatic ordering.
data.melt$Vowel.IPA <- IPA_vowels[as.numeric(data.melt$Vowel)]
Coordinate a set path for vowel space so that it follows the space around the periphery.
Vowel_space_path <- c("iy", "ei", "ih", "eh", "ae", "ah", "aw", "uh", "oa", "oo", "uw", "er")
Establish a column of vowel labels that is a copy of the Vowel column, but with an ordering that reflects the desired ordering of the vowel space path
data.melt$Vowel.ordered <- factor(data.melt$Vowel, levels = Vowel_space_path)
# Explicitly declare the number of each vowel in the vowel space path order
data.melt$Vowel.order.num <- as.numeric(data.melt$Vowel.ordered)
All of this vowel copying and ordering will make more sense when we make plots later.
Declare which measurement is made (e.g. F0, f1, f2, f3) by separating it from the time point (‘ss’ or 10-80)
data.melt$measure <- substr(data.melt$variable, 1,2)
# take a look at what that line of code did,
# by sampling some rows with unique measurement types:
data.melt[c(1, 8000, 9000, 30600), c("variable","measure")]
## variable measure
## 1 f0ss f0
## 8000 f110 f1
## 9000 f210 f2
## 30600 f350 f3
Get the time from the name of the variable being measured, which is indicated by the 3rd and 4th character.
Convert it to numeric so that numbers are automatically handled gracefully, and so steady-state “ss” measurements will be coded as NA.
You will receive a warning for the steady-state measurements, and this is normal.
data.melt$Time <- substr(data.melt$variable, 3,4) %>% as.numeric
## Warning in function_list[[k]](value): NAs introduced by coercion
# Let's check to make sure that worked
unique(data.melt[, c("variable","Time")])
## variable Time
## 1 f0ss NA
## 1668 f1ss NA
## 3335 f2ss NA
## 5002 f3ss NA
## 6669 f110 10
## 8336 f210 10
## 10003 f310 10
## 11670 f120 20
## 13337 f220 20
## 15004 f320 20
## 16671 f130 30
## 18338 f230 30
## 20005 f330 30
## 21672 f140 40
## 23339 f240 40
## 25006 f340 40
## 26673 f150 50
## 28340 f250 50
## 30007 f350 50
## 31674 f160 60
## 33341 f260 60
## 35008 f360 60
## 36675 f170 70
## 38342 f270 70
## 40009 f370 70
## 41676 f180 80
## 43343 f280 80
## 45010 f380 80
Yes, there are correct time values for all different measurement types except for “ss”
Get rid of the “variable” column, as all the info has been represented elsewhere
data.melt$variable <- NULL
Extract only F0 info in its own data.frame, copy the measurement into a new column called ‘f0’
data.f0 <- data.melt %>%
dplyr::filter(measure=="f0") %>%
mutate(f0 = value) %>%
select(-value)
!is.na()
call)
The last couple lines join this data frame with the F0 measurements that were just aggregated.
data.ss <- data.melt[is.na(data.melt$Time),] %>%
arrange(file, Vowel.order.num, measure, Time) %>%
mutate(formant = substr(measure, 2,2) %>%
as.character) %>%
filter(measure != "f0") %>%
left_join(., data.f0[,c("file","f0")]) %>%
mutate(Frequency = value) %>%
select(-value)
# Note that without the last two links in the pipe chain,
# "value" is the column representing the formant frequency,
# and it is located pretty far to the left, while the formant number
# is located pretty far to the right.
# to make this easier to work with, the "value" column is copied
# to a new column called "Frequency".
Join the melted data frame with F0 info using only rows that contain a valid time value (i.e. leave out steady-state values). With this data.frame, we can plot time-varying spectrogram-like plots, where formants are grouped by their number (f1, f2, f3)
data.long <- data.melt[!is.na(data.melt$Time),] %>%
arrange(file, Vowel.order.num, measure, Time) %>%
mutate(formant = substr(measure, 2,2) %>% as.character) %>%
left_join(., data.f0[,c("file","f0")]) %>%
mutate(Frequency = value) %>%
select(-value)
# We won't need these objects anymore, so clear them from the workspace
rm(rawdata, data.melt)
In other words, we need a pseudo-wide-format data frame. The following code will align f1, f2 and f3 values side by side in separate columns rather than stacked in a single column.
Omit the measurement and formant columns because they are no longer needed to clarify the single long column. Then, because the formant frequency column (“Frequency”) is re-named with the appropriate formant label (e.g “f1”), the “Frequency” column is dropped as well.
keep_one_formant <- function(data, formant){
temp <- data[data$measure==formant,] %>%
select(-measure, -formant) %>%
mutate(temp.formant = Frequency) %>%
select(-Frequency)
# rename the temporary column to match the input argument
names(temp)[names(temp)=="temp.formant"] <- formant
return(temp)
}
Extracting one formant series at a time, extract f1 and merge it with f2, then merge that object with the f3 data (because left_join
accepts only two objects to join). Ensure that Time order is preserved within each group. This is basically a customized way of dcast
ing or spread
ing the data.
data.wide <- data.long %>%
keep_one_formant(., "f1") %>%
left_join(., keep_one_formant(data.long, "f2")) %>%
left_join(., keep_one_formant(data.long, "f3")) %>%
group_by(file) %>%
arrange(Time) %>% ungroup
Do the same process for the steady-state measurements, (separate columns for each formant measurement), … but now we can discard all the treatment of Time.
data.wide.ss <- data.ss %>%
keep_one_formant(., "f1") %>%
left_join(., keep_one_formant(data.ss, "f2")) %>%
left_join(.,keep_one_formant(data.ss, "f3")) %>%
select(-Time)
Note how we are grouping by Vowel, but are preserving the ordered & labeled Vowel columns by also grouping by those columns as well, even though it is redundant.
data.wide.ss.sum <- data.wide.ss %>%
group_by(Gender, Age,
Vowel, Vowel.ordered, Vowel.order.num, Vowel.IPA) %>%
summarise(Duration = mean(Duration, na.rm = TRUE),
f0.sd= (sd(f0, na.rm = TRUE)) %>% round(., 2),
f1.sd= (sd(f1, na.rm = TRUE)) %>% round(., 1),
f2.sd= (sd(f2, na.rm = TRUE)) %>% round(., 1),
f3.sd= (sd(f3, na.rm = TRUE)) %>% round(., 1),
f0= mean(f0, na.rm = TRUE) %>% round(., 1),
f1= mean(f1, na.rm = TRUE) %>% round(., 0),
f2= mean(f2, na.rm = TRUE) %>% round(., 0),
f3= mean(f3, na.rm = TRUE) %>% round(., 0))
Here, we’ll summarize different vowels over time, for each talker group.
Recall that the Time vaiable has already been arranged # by an earlier step that created the data.wide
data frame.
data.wide.visc.sum <- data.wide %>%
group_by(Gender, Age, Vowel, Vowel.IPA, Time) %>%
summarise(f1.sd = sd(f1, na.rm=TRUE),
f2.sd = sd(f2, na.rm=TRUE),
f3.sd = sd(f3, na.rm=TRUE),
f1 = mean(f1, na.rm=TRUE),
f2 = mean(f2, na.rm=TRUE),
f3 = mean(f3, na.rm=TRUE),
Duration.sd = sd(Duration, na.rm=TRUE),
Duration = mean(Duration, na.rm=TRUE)) %>%
ungroup %>%
# Encode duration as a normalized value between 0 and 1
group_by(Age, Gender) %>%
mutate(Duration_norm = (Duration - min(Duration)) /
((max(Duration) - min(Duration)))) %>%
# Encode how much formant movement was observed
# (this is a linear value in the Hz domain;
# there are other ways of doing this)
group_by(Age, Gender, Vowel) %>%
mutate(f1visc = max(f1) - min(f1),
f2visc = max(f2) - min(f2),
f3visc = max(f3) - min(f3)) %>%
ungroup
First, be sure to declare an actual existing filepath/directory to save your objects.
files_to_save <- c("data.wide", "data.ss", "data.f0", "data.long", "data.wide.ss", "data.wide.ss.sum", "data.wide.visc.sum", "Vowel_space_path", "IPA_vowels")
setwd(folder_to_save_objects)
save(list = files_to_save, file = "HB95_data.RData")
We have now created tidy data frames to analyze and plot the data.