#!/usr/bin/perl
# Name: guess_encoding.pl
# Author: Aleksey Tsalolikhin
# Date: 19 Oct 2012
# Description: guess encoding of a file (for example, to answer
# the question is the encoding UTF-8 or Latin-1?). This script
# uses the "file" utility but it is more precise than just
# running "file" on your input file, because "file" does not
# read the entire file before guessing what type of content is
# in it.
use strict;
use warnings;
use IPC::Open2;

# Absolute path of the file(1) utility used to classify the extracted bytes.
my $FILE_CMD = '/usr/bin/file';

# Main loop: read STDIN line by line.  For every line that contains at least
# one non-ASCII byte, strip all ASCII bytes, report what is left, and ask
# file(1) to guess the encoding of exactly those remaining bytes.
#
# Input:  raw bytes on STDIN (no decoding layer is applied).
# Output: for each affected line, a "Found non-ASCII character(s)" message
#         followed by file(1)'s verdict with the "/dev/stdin:" prefix removed.
# Errors: open2 raises an exception if $FILE_CMD cannot be started.
while (my $data = <STDIN>) {
    # Only process the line if it contains non-ASCII bytes.
    next unless $data =~ /[^\x00-\x7F]/;

    # Remove every ASCII byte; this also drops the trailing newline.
    $data =~ s/[\x00-\x7F]//g;

    print "Found non-ASCII character(s): $data. "
        . "Checking encoding with $FILE_CMD.\n";

    # file(1) may stop reading before we finish writing a very long line;
    # ignore SIGPIPE so that does not kill this script.
    local $SIG{PIPE} = 'IGNORE';

    # List-form open2 runs file(1) directly, without a shell, so the input
    # bytes can never be interpreted as shell syntax.  "-" tells file(1) to
    # read the content to classify from its standard input.
    my $pid = open2(my $from_file, my $to_file, $FILE_CMD, '-');
    print {$to_file} "$data\n";
    close $to_file;

    my $verdict = do { local $/; <$from_file> };
    close $from_file;
    waitpid $pid, 0;

    # Strip the pseudo filename that file(1) prints for stdin input.
    $verdict = '' unless defined $verdict;
    $verdict =~ s{\A/dev/stdin:\s*}{};
    print $verdict;
}
# This is more precise than just running "file" on your input file,
# because "file" does not read the entire file before guessing what
# type of content is in it.