I am including a solution that works "okay" in R, although it is far from perfect.
# Estimate the syllable count of each word and produce a hyphenated rendering.
#
# Heuristic: the letters a, e, i, o, u and y (case-insensitive) are always
# treated as vowels. A word with at most one vowel counts as one syllable and
# is returned unchanged. Otherwise a hyphen is placed after the first vowel of
# every run of consecutive vowels, except when that vowel is the last
# character of the word or when the hyphens already placed plus one equal the
# vowel count. The syllable estimate is the number of hyphens plus one.
#
# Arguments:
#   words  character vector of words.
#
# Returns:
#   For exactly one word: a list with elements "syllables", "vowels" (number
#   of vowel letters) and "word" (the hyphenated string).
#   Otherwise: a list holding one such list per input word, in input order;
#   an empty input yields an empty list.
countSyllablesInWord = function(words)
{
  vowels = c("a", "e", "i", "o", "u", "y")
  n.words = length(words)
  result = vector("list", n.words)
  # seq_along() keeps the loop from running on empty input (1:0 would not).
  for (j in seq_along(words))
  {
    word = words[j]
    word.vec = strsplit(word, "")[[1]]
    n.char = length(word.vec)
    is.vowel = is.element(tolower(word.vec), vowels)
    n.vowels = sum(is.vowel)
    if (n.vowels <= 1)
    {
      syllables = 1
      str = word
    } else {
      str = ""
      n.hyphen = 0
      # TRUE at the start of the word and right after any consonant.
      after.consonant = TRUE
      for (i in seq_len(n.char))
      {
        str = paste0(str, word.vec[i])
        opens.vowel.run = is.vowel[i] && after.consonant
        # Never end the word on a hyphen, and never place so many hyphens
        # that the syllable estimate would exceed the vowel count.
        if (opens.vowel.run && i < n.char && n.vowels > (n.hyphen + 1))
        {
          str = paste0(str, "-")
          n.hyphen = n.hyphen + 1
        }
        after.consonant = !is.vowel[i]
      }
      syllables = 1 + n.hyphen
    }
    result[[j]] = list("syllables" = syllables, "vowels" = n.vowels, "word" = str)
  }
  # A single word is returned unwrapped, as callers of the original expect.
  if (n.words == 1) result[[1]] else result
}
Here are some results:
# Demo: count syllables for a handful of words and tabulate the results.
my.count = countSyllablesInWord(c("America", "beautiful", "spacious", "skies", "amber", "waves", "grain", "purple", "mountains", "majesty"))
# Build each column with its own type; flattening the nested list through
# unlist()/matrix() would coerce the numeric counts to character strings.
my.count.df = data.frame(
  syllables = vapply(my.count, function(entry) entry$syllables, numeric(1)),
  vowels = vapply(my.count, function(entry) entry$vowels, numeric(1)),
  word = vapply(my.count, function(entry) entry$word, character(1)),
  stringsAsFactors = FALSE
)
my.count.df
# syllables vowels word
# 1 4 4 A-me-ri-ca
# 2 4 5 be-auti-fu-l
# 3 3 4 spa-ci-ous
# 4 2 2 ski-es
# 5 2 2 a-mber
# 6 2 2 wa-ves
# 7 2 2 gra-in
# 8 2 2 pu-rple
# 9 3 4 mo-unta-ins
# 10 3 3 ma-je-sty
I didn't realize how big of a "rabbit hole" this is; it seems so easy at first.
################ hackathon #######
# https://en.wikipedia.org/wiki/Gunning_fog_index
# THIS is a CLASSIFIER PROBLEM ...
# https://mcmap.net/q/151150/-detecting-syllables-in-a-word
# http://www.speech.cs.cmu.edu/cgi-bin/cmudict
# http://www.syllablecount.com/syllables/
# https://enchantedlearning.com/consonantblends/index.shtml
# start.digraphs = c("bl", "br", "ch", "cl", "cr", "dr",
# "fl", "fr", "gl", "gr", "pl", "pr",
# "sc", "sh", "sk", "sl", "sm", "sn",
# "sp", "st", "sw", "th", "tr", "tw",
# "wh", "wr");
# start.trigraphs = c("sch", "scr", "shr", "sph", "spl",
# "spr", "squ", "str", "thr");
#
#
#
# end.digraphs = c("ch","sh","th","ng","dge","tch");
#
# ile
#
# farmer
# ar er
#
# vowel teams ... beaver1
#
#
# # "able"
# # http://www.abcfastphonics.com/letter-blends/blend-cial.html
# blends = c("augh", "ough", "tien", "ture", "tion", "cial", "cian",
# "ck", "ct", "dge", "dis", "ed", "ex", "ful",
# "gh", "ng", "ous", "kn", "ment", "mis", );
#
# glue = c("ld", "st", "nd", "ld", "ng", "nk",
# "lk", "lm", "lp", "lt", "ly", "mp", "nce", "nch",
# "nse", "nt", "ph", "psy", "pt", "re", )
#
#
# start.graphs = c("bl, br, ch, ck, cl, cr, dr, fl, fr, gh, gl, gr, ng, ph, pl, pr, qu, sc, sh, sk, sl, sm, sn, sp, st, sw, th, tr, tw, wh, wr");
#
# # https://mantra4changeblog.wordpress.com/2017/05/01/consonant-digraphs/
# digraphs.start = c("ch","sh","th","wh","ph","qu");
# digraphs.end = c("ch","sh","th","ng","dge","tch");
# # https://www.education.com/worksheet/article/beginning-consonant-blends/
# blends.start = c("pl", "gr", "gl", "pr",
#
# blends.end = c("lk","nk","nt",
#
#
# # https://sarahsnippets.com/wp-content/uploads/2019/07/ScreenShot2019-07-08at8.24.51PM-817x1024.png
# # Monte Mon-te
# # Sophia So-phi-a
# # American A-mer-i-can
#
# n.vowels = 0;
# for(i in 1:n.char)
# {
# my.char = word.vec[i];
#
#
#
#
#
# n.syll = 0;
# str = "";
#
# previous = "C"; # consonant vs "V" vowel
#
# for(i in 1:n.char)
# {
# my.char = word.vec[i];
#
# my.vowel = is.element(tolower(my.char), vowels);
# if(my.vowel)
# {
# n.vowels = 1 + n.vowels;
# if(previous == "C")
# {
# if(i == 1)
# {
# str = paste0(my.char, "-");
# } else {
# if(n.syll > 1)
# {
# str = paste0(str, "-", my.char);
# } else {
# str = paste0(str, my.char);
# }
# }
# n.syll = 1 + n.syll;
# previous = "V";
# }
#
# } else {
# str = paste0(str, my.char);
# previous = "C";
# }
# #
# }
#
#
#
#
## https://jzimba.blogspot.com/2017/07/an-algorithm-for-counting-syllables.html
# AIDE 1
# IDEA 3
# IDEAS 2
# IDEE 2
# IDE 1
# AIDA 2
# PROUSTIAN 3
# CHRISTIAN 3
# CLICHE 1
# HALIDE 2
# TELEPHONE 3
# TELEPHONY 4
# DUE 1
# IDEAL 2
# DEE 1
# UREA 3
# VACUO 3
# SEANCE 1
# SAILED 1
# RIBBED 1
# MOPED 1
# BLESSED 1
# AGED 1
# TOTED 2
# WARRED 1
# UNDERFED 2
# JADED 2
# INBRED 2
# BRED 1
# RED 1
# STATES 1
# TASTES 1
# TESTES 1
# UTILIZES 4
And for good measure, here is a simple Flesch-Kincaid readability function. Its `syllables` argument is the list of per-word results (each with a `syllables` count) returned from the first function.
Since my function is a bit biased towards more syllables, the text will score as harder to read than it really is (a higher FKGL grade level and a lower FRE reading-ease score) ... which for now is fine ... if the goal is to make text more readable, this is not the worst thing.
# Compute the Flesch Reading Ease (FRE) and Flesch-Kincaid Grade Level (FKGL)
# scores for a text.
#
# Arguments:
#   n.sentences  number of sentences in the text.
#   n.words      number of words in the text.
#   syllables    list of per-word entries, each a list with a numeric
#                "syllables" element; a single such entry is also accepted.
#                NULL or an empty list contributes zero syllables.
#
# Returns:
#   A list with numeric elements "FRE" and "FKGL".
computeReadability = function(n.sentences, n.words, syllables=NULL)
{
  # A bare entry (a list that itself carries a "syllables" element) is
  # wrapped so it is handled like a one-element list of entries.
  if (is.list(syllables) && "syllables" %in% names(syllables))
  {
    syllables = list(syllables)
  }
  n.syllables = 0
  # seq_along() gives zero iterations for NULL / empty input (1:0 would not).
  for (i in seq_along(syllables))
  {
    n.syllables = n.syllables + syllables[[i]]$syllables
  }
  words.per.sentence = n.words / n.sentences
  syllables.per.word = n.syllables / n.words
  # Flesch Reading Ease (FRE):
  FRE = 206.835 - 1.015 * words.per.sentence - 84.6 * syllables.per.word
  # Flesch-Kincaid Grade Level (FKGL):
  FKGL = 0.39 * words.per.sentence + 11.8 * syllables.per.word - 15.59
  # FKGL = -0.384236 * FRE - 20.7164 * (n.syllables/n.words) + 63.88355;
  # FKGL = -0.13948 * FRE + 0.24843 * (n.words/n.sentences) + 13.25934;
  list("FRE" = FRE, "FKGL" = FKGL)
}