Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "crt.bi"
' Encoding tags stored in iBomType by the detection code.
' Ordinals are written out explicitly; they match the implicit
' numbering an unadorned enum would assign (0, 1, 2, ...).
enum BomType
  btUnknown = 0   ' nothing recognised (initial state)
  btAscii   = 1   ' no zero bytes and no bytes above 127
  btUTF8    = 2
  btUTF16le = 3
  btUTF16be = 4
  btUTF32le = 5
  btUTF32be = 6
end enum
' ---------------------------------------------------------------------------
' Encoding detection (module-level script).
'
' 1. Loads the whole file into a heap buffer (pFile, iFileSz bytes plus one
'    spare byte used as a sentinel by the statistics pass).
' 2. Looks for a byte-order mark at the start of the buffer.
' 3. If no BOM was found, gathers statistics about zero bytes / high bytes
'    and guesses the encoding from them.
' 4. Stops the program unless the result is UTF-16LE, which is the only
'    encoding the code after this section handles.
'
' Leaves pFile, iFileSz, iBomSz and iBomType set for the code that follows.
' ---------------------------------------------------------------------------
dim as string sFilename = "filetest.txt"
dim as integer f = freefile()

if open(sFilename for binary access read as #f) then
  print "Failed to open '"+sFilename+"'"
  sleep: system
end if

dim as longint iFileSz = lof(f),iStart=0
if iFileSz <= 8 or iFileSz > 256*1024*1024 then
  print "bad file size for detection: " & iFileSz
  ' close the handle we actually opened (freefile() need not return 1)
  close #f: sleep: system
end if

'reading file on memory (+1 byte: sentinel slot for the statistics loop)
dim as ubyte ptr pFile = allocate(iFileSz+1)
if pFile = 0 then
  print "Failed to allocate memory for '"+sFilename+"'"
  close #f: sleep: system
end if
get #f,,*pFile,iFileSz
close #f

'step 1: the file has a bom?
' NOTE: the constants below are the BOM byte sequences as they appear when
' read through a ulong/ushort pointer on a little-endian host.
dim as integer iBomSz = 0, iBomType = btUnknown
print "Type: ";
if *cptr(ulong ptr, pFile) = &hFFFE0000 then
  print "(bom) UTF-32 (Big endian)"
  iBomSz = 4: iBomType = btUTF32be
elseif *cptr(ulong ptr, pFile) = &h0000FEFF then
  print "(bom) UTF-32 (Little endian)"
  iBomSz = 4: iBomType = btUTF32le
elseif (*cptr(ulong ptr, pFile) and &hFFFFFF) = &hBFBBEF then
  print "(bom) UTF-8"
  iBomSz = 3: iBomType = btUTF8
elseif *cptr(ushort ptr, pFile) = &hFFFE then
  print "(bom) UTF-16 (Big Endian)"
  iBomSz = 2: iBomType = btUTF16be
elseif *cptr(ushort ptr, pFile) = &hFEFF then
  print "(bom) UTF-16 (Little Endian)"
  iBomSz = 2: iBomType = btUTF16le
end if

#if 0 'forcing autodetection...
  if iBomType <> btUnknown then
    print "Autodetected: ";
    iBomType = btUnknown
  end if
#endif

if iBomType = btUnknown then
  'still unknown so let's see how the zero bytes are distributed
  ' iCount      : length of the current run (zero or non-zero bytes)
  ' iZeroNone   : total bytes that belong to non-zero runs
  ' iZeroOne/Two/More : number of zero runs of length 1 / 2 / 3+
  ' iZeroStreak : total zero bytes that belong to runs of length 3+
  ' iZeroCount  : total zero bytes, iZeroHigh : bytes above 127
  ' iZeroOdd/Even   : zero bytes at odd / even offsets
  ' iZeroOddW/EvenW : 2-byte zero runs, split by (offset and 3) of the
  '                   byte that follows the run
  ' iLast       : kind of the previous byte (0 = zero, 1 = non-zero, -1 = none yet)
  dim as integer iCount,iZeroNone=iBomSz,iZeroOne,iZeroTwo,iZeroMore
  dim as integer iZeroCount,iZeroHigh,iZeroStreak
  dim as integer iZeroOdd,iZeroEven,iZeroOddW,iZeroEvenW, iLast = -1

  'to make last byte finish the count either way:
  'the sentinel is the opposite kind of the last real byte, so the final
  'run is always flushed. A zero sentinel would be counted as a zero byte,
  'hence the -1 compensation.
  if pFile[iFileSz-1]=0 then
    pFile[iFileSz]=1
  else
    pFile[iFileSz]=0: iZeroCount = -1
  end if

  'now let's grab statistics (loop includes the sentinel at index iFileSz)
  for N as integer = 0 to iFileSz
    var iChar = cint(pFile[N])
    if iChar=0 then
      iZeroCount += 1
      if (N and 1) then iZeroOdd += 1 else iZeroEven += 1
      if iLast=0 then
        iCount += 1
      else
        'a non-zero run just ended: account for its length
        if iCount > 0 then iZeroNone += iCount
        iCount = 1: iLast=0
      end if
    else
      if iChar > 127 then iZeroHigh += 1
      if iLast=1 then
        iCount += 1
      else
        'a zero run just ended: classify it by length
        select case iCount
        case 0 'nothing
        case 1
          iZeroOne += 1
        case 2
          iZeroTwo += 1
          if (N and 3) > 1 then iZeroOddW += 1 else iZeroEvenW += 1
        case else
          iZeroMore += 1: iZeroStreak += iCount
        end select
        iCount = 1: iLast=1
      end if
    end if
  next N

  if iZeroNone = iFileSz and iZeroHigh=0 then
    print "Ascii": iBomType = btAscii
  elseif iZeroCount < iFileSz\4 then
    if iZeroHigh > iZeroCount then
      print "UTF-8 or Ansi": iBomType = btUTF8
    end if
  else
    if iZeroCount > iFileSz*.66 then 'UTF32
      if iZeroTwo*2 > iZeroStreak then
        if iZeroOddW > iZeroEvenW then
          print "UTF32-LE": iBomType = btUTF32le
        else
          print "UTF32-BE": iBomType = btUTF32be
        end if
      end if
    else
      if iZeroOne > iZeroTwo and (iZeroOne*2) > iZeroStreak then 'UTF16
        if iZeroOdd > iZeroEven then
          print "UTF16-LE": iBomType = btUTF16le
        else
          'message now matches the type that is actually assigned
          print "UTF16-BE": iBomType = btUTF16be
        end if
      end if
    end if
  end if

  if iBomType = btUnknown then print "Binary"
end if

' looking for non ascii chars
print
if iBomType <> btUTF16le then
  print "This is only for UTF16LE"
  deallocate(pFile)
  sleep: system
end if
' Prints one line of 16-bit code units between quotes, prefixed with its
' zero-padded line number.
'   pStart   - first code unit to print
'   pEnd     - last code unit to print (inclusive)
'   iLineNum - number shown in the "Line NNNNN:" prefix
' Tab and the values 32..127 are printed in colour 8 on 0; every other
' value is printed in colour 12 on 1 so it stands out.
sub ShowLine( pStart as ushort ptr , pEnd as ushort ptr , iLineNum as integer )
  color 11: printf "Line %05i: '", iLineNum
  do until pStart > pEnd
    dim as integer iCode = *pStart
    if iCode = 9 orelse (iCode >= 32 andalso iCode <= 127) then
      color 8,0    'plain character (13 is deliberately not in this set)
    else
      color 12,1   'anything else gets highlighted
    end if
    print wchr(iCode);
    pStart += 1
  loop
  color 11,0: print "'"
end sub
' ---------------------------------------------------------------------------
' Scan of the loaded buffer as 16-bit code units (module-level script).
'
' Walks the data that follows the BOM, splitting it into lines on the value
' 10. A line is reported through ShowLine when it contains any code unit
' other than 9, 13 or 32..127. Uses pFile, iFileSz and iBomSz prepared by the
' preceding code, and releases pFile when done.
' ---------------------------------------------------------------------------

'starting right after the bom
var pLine = cast(ushort ptr,pFile+iBomSz), pChar = pLine, iLineNum = 1 , HasInvalid = 0

'one past the last COMPLETE 16-bit unit; a trailing odd byte is ignored so
'the loop never reads half a code unit beyond the file data
var pLimit = pLine + cint((iFileSz-iBomSz)\2)

'processing chars for invalid or non ascii... or line endings
while cuint(pChar) < cuint(pLimit)
  select case *pChar
  case 9,13,32 to 127 'ascii and <32 exceptions (13 is part of line end but is ignored here)
    pChar += 1
  case 10 'end of line
    if HasInvalid then 'show line number
      var pEnd = pChar-1
      if *pEnd = 13 then pEnd -= 1 'was CRLF so minus 1
      ShowLine( pLine , pEnd , iLineNum )
    end if
    pChar += 1: iLineNum += 1
    pLine = pChar: HasInvalid = 0
  case else
    HasInvalid = 1: pChar += 1
  end select
wend

'last line without a terminating 10: pChar is one PAST the final unit and
'ShowLine's end pointer is inclusive, so hand it pChar-1
if HasInvalid then ShowLine(pLine,pChar-1,iLineNum)

deallocate(pFile)
color 7: print "Done."
sleep
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement