On a project that I’ve been working on recently, I was having some trouble combining SQL scripts that were in a couple of different formats.
While there’s no easy way to detect all of the possible encodings, by checking the byte order mark (BOM), there is a pretty straightforward way to detect the following encodings:
/// <summary>
/// Detects the text encoding of a file by comparing its leading bytes with the
/// byte order mark (preamble) of every encoding known to the runtime.
/// </summary>
/// <param name="path">Path of the file to inspect.</param>
/// <returns>
/// The encoding whose preamble matches the start of the file, or
/// <see cref="Encoding.Default"/> when no preamble matches.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public static Encoding GetFileEncoding(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // Longest preambles are tested first: the UTF-16 LE BOM (FF FE) is a prefix of
    // the UTF-32 LE BOM (FF FE 00 00), so checking the shorter one first would
    // misreport UTF-32 LE files as UTF-16 LE. OrderByDescending is a stable sort,
    // so encodings with equal-length preambles keep their original relative order.
    var candidates = Encoding.GetEncodings()
        .Select(info => info.GetEncoding())
        .Select(encoding => new { Encoding = encoding, Preamble = encoding.GetPreamble() })
        .Where(candidate => candidate.Preamble.Length > 0)
        .OrderByDescending(candidate => candidate.Preamble.Length)
        .ToArray();

    // Nothing to compare against; also avoids indexing into an empty array below.
    if (candidates.Length == 0)
    {
        return Encoding.Default;
    }

    var maxPreambleLength = candidates[0].Preamble.Length;
    byte[] buffer = new byte[maxPreambleLength];
    int bytesRead = 0;

    using (var stream = File.OpenRead(path))
    {
        // Stream.Read may return fewer bytes than requested, so keep reading until
        // the buffer is full or the end of the file is reached.
        int chunk;
        while (bytesRead < buffer.Length
            && (chunk = stream.Read(buffer, bytesRead, buffer.Length - bytesRead)) > 0)
        {
            bytesRead += chunk;
        }
    }

    // Only preambles that fit within the bytes actually read may match; otherwise
    // the zero padding at the end of the buffer could be mistaken for file content.
    return candidates
        .Where(candidate => candidate.Preamble.Length <= bytesRead
            && candidate.Preamble.SequenceEqual(buffer.Take(candidate.Preamble.Length)))
        .Select(candidate => candidate.Encoding)
        .FirstOrDefault() ?? Encoding.Default;
}