package Lingua::ES::Hyphenate;

use strict;
use warnings;
use utf8;    # the patterns below contain literal accented letters

require Exporter;
use Scalar::Util ();

our @ISA         = qw(Exporter);
our %EXPORT_TAGS = ( 'all' => [qw( hyphenate syllable_cnt )] );
our @EXPORT_OK   = ( @{ $EXPORT_TAGS{'all'} } );
our @EXPORT      = qw( hyphenate syllable_cnt );
our $VERSION     = '.01';

=encoding utf8

=head1 NAME

Lingua::ES::Hyphenate - Separates Spanish words into syllables

=head1 SYNOPSIS

    use Lingua::ES::Hyphenate;

    @syllables = hyphenate('oportunidad');
    # @syllables now holds ('o', 'por', 'tu', 'ni', 'dad')

    # or
    $word      = Lingua::ES::Hyphenate->new('oportunidad');
    @syllables = $word->hyphenate;

=head1 DESCRIPTION

Separates Spanish words into syllables.

Words may be passed either as Perl character strings or as UTF-8 encoded
octets. Octet input is decoded internally and the returned syllables are
encoded again, so the caller gets back the same kind of string it passed in.

=head1 SPANISH SYLLABLE STRUCTURE

The Spanish syllable structure can be summarized as follows:

    C1 C2 S1 V S2 C3 C4

Spanish syllable structure allows a maximum of two consonants in its onset,
a nucleus of a vowel followed by and/or preceded by a semivowel, and a
maximum of two consonants in its coda. The following restrictions apply:

    Onset
        First consonant (C1):  Can be any consonant.
        Second consonant (C2): If and only if the first consonant is a
            plosive /p, t, k, b, d, g/ or a voiceless labiodental
            fricative /f/, then the second consonant can be a liquid
            /l, r/. Although they occur, the onsets /tl/ and /dl/ are not
            native to Spanish.
    Nucleus
        Semivowel (S1)
        Vowel (V)
        Semivowel (S2)
    Coda
        First consonant (C3):  Can be any consonant.
        Second consonant (C4): Must be /s/.

=head1 SEE ALSO

L<http://en.wikipedia.org/wiki/Spanish_phonology#Phonotactics>

=cut

# Number of syllables found in the most recently parsed word. This is
# module-wide state shared by hyphenate() and syllable_cnt().
my $cnt;

# Every character a word may contain. Accented letters are listed in both
# cases so matching does not depend on which case-folding rules are active.
my $letters = qr/[A-ZÁÉÍÓÚÜÑáéíóúüñ]/i;

# Any consonant. The group is atomic so that a digraph, once matched, is never
# re-split into single letters by backtracking. GU counts as a digraph only
# before E or I, where the U is silent (guerra, guitarra).
my $any_cons =
  qr/(?>RR|LL|CH|QU|GU(?=[EIÉÍéí])|[BCDFGHJKLMNÑñPQRSTVWXYZ])/i;
my $pre_r = qr/[PKCBGFTD]/i;    # these may precede R in an onset
my $pre_l = qr/[PKCBGF]/i;      # these may precede L in an onset

# Optional second onset consonant, licensed by the consonant just before it.
my $c2 = qr/
      (?<=^$pre_r) L    # word-initial TL or DL (loanwords)
    | (?<=$pre_r)  R    # PR KR CR BR GR FR TR DR
    | (?<=$pre_l)  L    # PL KL CL BL GL FL
/ix;
my $onset = qr/$any_cons$c2?/i;

my $semi_vowel = qr/[UIÜü]/i;                 # weak (closed) vowels
my $vowel      = qr/[AEOÁÉÍÓÚáéíóú]/i;        # strong or accented vowels
my $all_vows   = qr/[AEIOUÁÉÍÓÚÜáéíóúü]/i;    # anything vowel-like

my $nucleus = qr/
      $semi_vowel? $vowel $semi_vowel?   # diphthong or triphthong around a vowel
    | [UÜü] I                            # two different weak vowels: ui
    | I U                                # two different weak vowels: iu
    | $semi_vowel                        # a weak vowel on its own
/ix;

my $coda = qr/${any_cons}S?/i;    # the second coda consonant can only be S

my $syllable = qr/
    $onset?     # onsets are optional
    $nucleus    # nuclei are not optional
    (?:
        $coda
        # The letters after the coda must not be able to open the next
        # syllable; if they can, the coda is dropped and those consonants
        # are parsed as the onset of the next syllable instead.
        (?(?<=$pre_l)   # IF the coda ended in a pre-L consonant
            (?!L)       #   it must not be followed by L
        )
        (?(?<=$pre_r)   # IF the coda ended in a pre-R consonant
            (?!R)       #   it must not be followed by R
        )
        (?!$all_vows)   # and never by a vowel or semivowel
    )?          # coda is optional
/ix;

# True when the argument is an instance of this class or of a subclass.
sub _is_hyphenater {
    my ($thing) = @_;
    return Scalar::Util::blessed($thing) && $thing->isa(__PACKAGE__) ? 1 : 0;
}

# Returns ($text, $was_octets). A string that is not flagged as characters,
# contains high bytes and is well-formed UTF-8 is decoded; anything else is
# passed through untouched and treated as characters.
sub _decode_if_octets {
    my ($text) = @_;
    my $was_octets = 0;
    if ( !utf8::is_utf8($text) && $text =~ /[\x80-\xFF]/ ) {
        $was_octets = utf8::decode($text) ? 1 : 0;
    }
    return ( $text, $was_octets );
}

# Counts the syllable matches in a string without modifying it.
sub _count_syllables {
    my ($text) = @_;
    my ($chars) = _decode_if_octets($text);
    my $total = () = $chars =~ /$syllable/g;
    return $total;
}

=head1 CONSTRUCTOR

Not necessary, since functions are exported.

    my $hyphenater = Lingua::ES::Hyphenate->new('charlar');

=cut

sub new {
    my ( $class, $word ) = @_;
    return bless \$word, ref($class) || $class;
}

=head1 hyphenate

Returns array of syllables from input word. An empty list is returned when
the input is empty or contains anything other than Spanish letters.
Consonants left over at the end of a word (as in the loanword C<film>) stay
attached to the last syllable.

    my $hyphenater = Lingua::ES::Hyphenate->new('charlar');
    @syllables = $hyphenater->hyphenate();

    # or
    @syllables = hyphenate('tomarlo');

=cut

sub hyphenate {
    $_[0] || return ();
    my $input = _is_hyphenater( $_[0] ) ? ${ $_[0] } : $_[0];
    return () unless defined $input && length $input;

    my ( $word, $was_octets ) = _decode_if_octets($input);

    # \z rather than $ so that a trailing newline is rejected.
    return () unless $word =~ /\A$letters+\z/;

    # Mark the end of every syllable with '='; it cannot occur in a valid word.
    $cnt = ( $word =~ s/($syllable)/$1=/g ) || 0;

    # Drop the final marker, folding any trailing unmatched consonants into
    # the last syllable so the list length agrees with the syllable count.
    $word =~ s/=([^=]*)\z/$1/;

    my @syllables = split /=/, $word;
    if ($was_octets) {
        utf8::encode($_) for @syllables;
    }
    return @syllables;
}

=head1 syllable_cnt

Returns number of syllables in string argument. If no argument is provided,
returns the number of syllables in the last word parsed. When called as a
method, an explicit argument takes precedence over the word stored in the
object.

    my $cnt = syllable_cnt('tomarlo');

    # or
    my $hyphenater = Lingua::ES::Hyphenate->new('charlar');
    my $cnt = $hyphenater->syllable_cnt('escuela');

    # or
    my @syllables = hyphenate('majaderías');
    $cnt = syllable_cnt();    # same as $cnt = @syllables;

=cut

sub syllable_cnt {
    if ( _is_hyphenater( $_[0] ) ) {
        my ( $self, $explicit ) = @_;
        my $word = defined $explicit ? $explicit : $$self;
        $word = '' unless defined $word;
        $cnt = _count_syllables($word);
        return $cnt;
    }

    if ( defined $_[0] && $_[0] ne '' ) {
        $cnt = _count_syllables( $_[0] );
        return $cnt;
    }

    return $cnt;    # default: number of syllables in the last word parsed
}

1;

=head1 AUTHOR

Nathan Glenn

=head1 COPYRIGHT AND LICENSE

Copyright 2010 by Nathan Glenn

This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 NEEDS WORK

Atlanta splits as 'A-tlan-ta'. Is that correct? 'tl' and 'dl' are not native
sounds, and Atlanta is a loanword, so maybe it's okay. 'At-lan-ta' seems
more natural to me. (TODO: confirm how the current rules split this word.)

=cut