package misspell

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"path/filepath"
	"strings"
)

// binary is the set of lower-case file extensions that are treated as
// binary content.
//
// The number of possible binary formats is very large; this table only
// lists items that might be checked into a repo or be an artifact of a
// build.  Additions welcome.
//
// Golang's internal table is very small and can't be relied on.  Even
// then, things like ".js" have a MIME type of "application/javascript",
// which isn't very helpful.
//
// "[x]" means we have a sniff test and the suffix test should be eliminated.
var binary = map[string]bool{
	".a":     true, // [ ] archive
	".bin":   true, // [ ] binary
	".bz2":   true, // [ ] compression
	".class": true, // [x] Java class file
	".dll":   true, // [ ] shared library
	".exe":   true, // [ ] binary
	".gif":   true, // [ ] image
	".gpg":   true, // [x] text, but really all base64
	".gz":    true, // [ ] compression
	".ico":   true, // [ ] image
	".jar":   true, // [x] archive
	".jpeg":  true, // [ ] image
	".jpg":   true, // [ ] image
	".mp3":   true, // [ ] audio
	".mp4":   true, // [ ] video
	".mpeg":  true, // [ ] video
	".o":     true, // [ ] object file
	".pdf":   true, // [x] pdf
	".png":   true, // [x] image
	".pyc":   true, // [ ] Python bytecode
	".pyo":   true, // [ ] Python bytecode
	".so":    true, // [x] shared library
	".swp":   true, // [ ] vim swap file
	".tar":   true, // [ ] archive
	".tiff":  true, // [ ] image
	".woff":  true, // [ ] font
	".woff2": true, // [ ] font
	".xz":    true, // [ ] compression
	".z":     true, // [ ] compression
	".zip":   true, // [ ] archive
}

// isBinaryFilename reports whether the file name ends in an extension
// listed in the binary table.  The extension is lower-cased before the
// lookup, so the match is case-insensitive.
//
// Better heuristics could be used here; in particular a binary file is
// unlikely to be UTF-8 encoded.  However this check is cheap and solves
// the immediate need of making sure common binary formats are not
// corrupted by mistake.
func isBinaryFilename(s string) bool {
	ext := filepath.Ext(s)
	return binary[strings.ToLower(ext)]
}

// scm is the set of directory names that isSCMPath treats as belonging
// to a source-control system.
var scm = map[string]bool{
	".bzr": true,
	".git": true,
	".hg":  true,
	".svn": true,
	"CVS":  true,
}

// isSCMPath reports whether the path is likely part of a (private) SCM
// directory, i.e. whether any element of the cleaned path is listed in
// the scm table.  E.g.  .git/something  = true
func isSCMPath(s string) bool {
	// Hack for .git/COMMIT_EDITMSG and .git/TAG_EDITMSG:
	// normally nothing inside .git is inspected, but git uses these
	// two as temp files while composing a commit or tag message.
	// Letting misspell read them makes commit-msg hooks possible.
	// https://git-scm.com/book/en/v2/Customizing-Git-Git-Hooks
	if base := filepath.Base(s); strings.Contains(base, "EDITMSG") {
		return false
	}

	cleaned := filepath.Clean(s)
	for _, elem := range strings.Split(cleaned, string(filepath.Separator)) {
		if scm[elem] {
			return true
		}
	}
	return false
}

// magicHeaders lists leading byte sequences that mark content as
// something that must not be spell-checked, regardless of what the
// generic content sniffer says.
var magicHeaders = [][]byte{
	// Issue #68
	// PGP messages and signatures are "text" but really just
	// blobs of base64-text and should not be misspell-checked
	[]byte("-----BEGIN PGP MESSAGE-----"),
	[]byte("-----BEGIN PGP SIGNATURE-----"),

	// ELF
	{0x7f, 0x45, 0x4c, 0x46},

	// Postscript
	{0x25, 0x21, 0x50, 0x53},

	// PDF
	{0x25, 0x50, 0x44, 0x46},

	// Java class file
	// https://en.wikipedia.org/wiki/Java_class_file
	{0xCA, 0xFE, 0xBA, 0xBE},

	// PNG
	// https://en.wikipedia.org/wiki/Portable_Network_Graphics
	{0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a},

	// ZIP, JAR, ODF, OOXML
	{0x50, 0x4B, 0x03, 0x04},
	{0x50, 0x4B, 0x05, 0x06},
	{0x50, 0x4B, 0x07, 0x08},
}

// isTextFile reports whether raw looks like UTF-8 text.
//
// Content that starts with one of the magicHeaders is rejected outright.
// Otherwise the verdict comes from http.DetectContentType: the detected
// type must be a "text/" type and must end in "charset=utf-8".
func isTextFile(raw []byte) bool {
	for i := range magicHeaders {
		if bytes.HasPrefix(raw, magicHeaders[i]) {
			return false
		}
	}

	// Allow any text/ type with utf-8 encoding.
	// DetectContentType sometimes returns charset=utf-16 for XML stuff,
	// in which case the content is ignored.
	mime := http.DetectContentType(raw)
	if !strings.HasPrefix(mime, "text/") {
		return false
	}
	return strings.HasSuffix(mime, "charset=utf-8")
}

// ReadTextFile returns the contents of a file, first testing if it is a
// text file:
//   - ("", nil) if the file is not a text file (or is a directory)
//   - ("", error) if an error occurred
//   - (string, nil) if the file is text
//
// Errors wrap the underlying filesystem error, so callers may inspect
// them with errors.Is / errors.As.
//
// Unfortunately, in the worst case, this does
//   - 1 stat
//   - 1 open, read, close of 512 bytes
//   - 1 more open, read everything, close (via ioutil.ReadFile)
//
// This could be kinder to the filesystem.
//
// This uses some heuristics of the file's extension (e.g. .zip, .txt) and
// uses a sniffer to determine if the file is text or not.
// Using file extensions isn't great, but probably
// good enough for real-world use.
// Golang's built in sniffer is problematic for different reasons.  It's
// optimized for HTML, and is very limited in detection.  It would be good
// to explicitly add some tests for ELF/DWARF formats to make sure we never
// corrupt binary files.
func ReadTextFile(filename string) (string, error) {
	// Name-based rejections come first: they need no filesystem access.
	if isBinaryFilename(filename) || isSCMPath(filename) {
		return "", nil
	}

	fstat, err := os.Stat(filename)
	if err != nil {
		return "", fmt.Errorf("Unable to stat %q: %w", filename, err)
	}

	// directory: nothing to do.
	if fstat.IsDir() {
		return "", nil
	}

	// Avoid reading in multi-gig files: if the input is large, sniff the
	// type from its first bytes and bail out early when it is not text.
	const largeFileSize = 50000
	alreadySniffed := false
	if fstat.Size() > largeFileSize {
		head, err := readFileHead(filename)
		if err != nil {
			return "", err
		}
		if !isTextFile(head) {
			return "", nil
		}

		// set so we don't double check this file
		alreadySniffed = true
	}

	// read in whole file
	raw, err := ioutil.ReadFile(filename)
	if err != nil {
		return "", fmt.Errorf("Unable to read all %q: %w", filename, err)
	}

	if !alreadySniffed && !isTextFile(raw) {
		return "", nil
	}
	return string(raw), nil
}

// readFileHead returns the first 512 bytes of filename for content
// sniffing.  It fails if the file cannot be opened or holds fewer than
// 512 bytes.
func readFileHead(filename string) ([]byte, error) {
	fin, err := os.Open(filename)
	if err != nil {
		return nil, fmt.Errorf("Unable to open large file %q: %w", filename, err)
	}
	// Runs when this helper returns, so the handle is released before the
	// caller opens the file a second time to read it in full.
	defer fin.Close()

	buf := make([]byte, 512)
	if _, err := io.ReadFull(fin, buf); err != nil {
		return nil, fmt.Errorf("Unable to read 512 bytes from %q: %w", filename, err)
	}
	return buf, nil
}