#!/usr/local/bin/perl
#
# qancord.pl
# Clifton Pye
# This program produces a lexical concordance for a Unicode text file exported from Elan via Excel.
# The exported text file should have a header row and be saved in the UTF-8 format with the .txt extension.

use strict;
use warnings;

# Purpose: read a tab-separated text file (first row is a header), collect
# every word of the text tier, and write a concordance file that lists each
# word followed by all the lines in which it occurs.
#
# Input : "<name>.txt", where <name> is typed by the user at the prompt.
# Output: "<name>.con", one block per word, words in sorted order.

my $extension = ".txt";
my $concord   = "con";

print "What is the file you wish to analyze? \n";
print "Type the filename without an extension \n";
print "The extension should be txt \n\n";

my $textfile = <>;                       # Read the filename from the keyboard input
die( "No filename was given" ) unless defined $textfile;

# Strip the line ending and any blanks typed around the name.  A substitution
# is used instead of chop so that a real character is never removed when the
# input does not end with a newline.
$textfile =~ s/^\s+//;
$textfile =~ s/\s+$//;

my $file = $textfile . $extension;
print "Now analyzing $textfile\n\n";

# Decode the input as UTF-8 so that lc() lower-cases non-ASCII letters too.
open( my $text_in, '<:encoding(UTF-8)', $file )
    or die( "Could not open $file: $!" );

# The header row is only used to count the tiers (columns).
my @header_tiers = split_tiers( scalar <$text_in> );

# Files with 4 or more tiers keep the text in column 2 and print columns 0-3;
# all other files keep the text in column 1 and print columns 0-2.
my ( $text_col, $last_col ) = ( @header_tiers >= 4 ) ? ( 2, 3 ) : ( 1, 2 );

# word => reference to the list of formatted lines containing that word.
# Keeping a list per word (rather than one delimiter-joined string) means
# that no character occurring in the text can be mistaken for a separator.
my %words;

while ( my $line = <$text_in> ) {
    my @tiers = split_tiers( $line, $last_col + 1 );
    my $entry = format_entry( \@tiers, $last_col );

    # A word that occurs twice in a line lists that line twice.
    foreach my $word ( extract_words( $tiers[$text_col] ) ) {
        push @{ $words{$word} }, $entry;
    }
}

close $text_in;

# Build the output name by concatenation; writing the dot inside an
# interpolated string would put the surrounding spaces into the filename.
my $outfile = $textfile . '.' . $concord;
open( my $text_out, '>:encoding(UTF-8)', $outfile )
    or die( "Could not open the output file $outfile: $!" );

# Write the results: the word, then each of its lines prefixed by two tabs,
# so the first line shares a row with the word itself.
foreach my $word ( sort keys %words ) {
    print {$text_out} $word;
    foreach my $entry ( @{ $words{$word} } ) {
        print {$text_out} "\t\t$entry\n";
    }
}

# Buffered write errors only surface at close, so check it.
close($text_out) or die( "Could not write $outfile: $!" );

# split_tiers( $line, $min_fields )
# Remove the line ending (LF or CRLF) from $line and split it on tabs.
# When $min_fields is given, the result is padded with empty strings up to
# that many fields so that short rows can be indexed safely.
# Returns the list of fields; an undefined $line yields an empty list.
sub split_tiers {
    my ( $line, $min_fields ) = @_;
    return () unless defined $line;

    $line =~ s/[\r\n]+\z//;
    my @tiers = split( /\t/, $line );

    if ( defined $min_fields ) {
        push @tiers, '' while @tiers < $min_fields;
    }
    return @tiers;
}

# extract_words( $text )
# Lower-case $text, replace the punctuation characters " ( ) , ; : . ! ?
# with spaces, and return the whitespace-separated words.
sub extract_words {
    my ($text) = @_;
    $text = lc $text;
    $text =~ s/["(),;:.!?]/ /g;
    return split( ' ', $text );          # ' ' also skips leading whitespace
}

# format_entry( \@tiers, $last_col )
# Build the line stored for each word: columns 0 and 1 joined by a tab,
# then two tabs and a space, then columns 2..$last_col joined by tabs.
sub format_entry {
    my ( $tiers, $last_col ) = @_;
    return join( "\t", @{$tiers}[ 0, 1 ] )
        . "\t\t "
        . join( "\t", @{$tiers}[ 2 .. $last_col ] );
}