Advertisement
Mysoft

Untitled

Nov 22nd, 2016
685
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #include "crt.bi"
  2.  
  ' Encodings the detector can report. btUnknown is the initial state and
  ' is what remains when neither the BOM check nor the heuristics match.
  enum BomType
    btUnknown = 0   ' nothing detected (yet)
    btAscii   = 1   ' no zero bytes and no byte above 127
    btUTF8    = 2   ' UTF-8 (BOM found) or UTF-8/ANSI guessed
    btUTF16le = 3   ' UTF-16, little endian
    btUTF16be = 4   ' UTF-16, big endian
    btUTF32le = 5   ' UTF-32, little endian
    btUTF32be = 6   ' UTF-32, big endian
  end enum
  12.  
  ' File to analyse, and a free file number to open it on.
  dim as string sFilename = "filetest.txt"
  dim as integer f = freefile()

  ' open() yields a non-zero error code when the file cannot be opened.
  if open(sFilename for binary access read as #f) then
    print "Failed to open '"+sFilename+"'"
    sleep: system
  end if

  ' The whole file is loaded into memory and the BOM check reads its first
  ' four bytes, so very small and very large (> 256 MiB) files are refused.
  dim as longint iFileSz = lof(f),iStart=0
  if iFileSz <= 8 or iFileSz > 256*1024*1024 then
    print "bad file size for detection: " & iFileSz
    ' Close the handle that was actually opened (f), not a hard-coded #1.
    close #f: sleep: system
  end if

  ' Read the file into memory. One extra byte is allocated because the
  ' statistics pass later stores a sentinel at pFile[iFileSz].
  dim as ubyte ptr pFile = allocate(iFileSz+1)
  if pFile = 0 then
    ' allocate() returns a null pointer when the request cannot be satisfied.
    print "Out of memory while loading '"+sFilename+"'"
    close #f: sleep: system
  end if
  get #f,,*pFile,iFileSz
  close #f
  31.  
  ' Step 1: does the file start with a byte-order mark?
  ' iBomSz is the BOM length in bytes (0 when none), iBomType the encoding
  ' it identifies.
  dim as integer iBomSz = 0, iBomType = btUnknown
  print "Type: ";
  scope
    ' The leading bytes are fetched once as native integers; the constants
    ' below are the BOM byte sequences as they appear through those reads.
    dim as ulong  uLead4 = *cptr(ulong ptr, pFile)
    dim as ushort uLead2 = *cptr(ushort ptr, pFile)

    ' The 4-byte marks are tested before the 2-byte ones because the
    ' UTF-32 LE mark begins with the same two bytes as the UTF-16 LE mark.
    if uLead4 = &hFFFE0000 then
      print "(bom) UTF-32 (Big endian)"
      iBomSz = 4
      iBomType = btUTF32be
    elseif uLead4 = &h0000FEFF then
      print "(bom) UTF-32 (Little endian)"
      iBomSz = 4
      iBomType = btUTF32le
    elseif (uLead4 and &hFFFFFF) = &hBFBBEF then
      print "(bom) UTF-8"
      iBomSz = 3
      iBomType = btUTF8
    elseif uLead2 = &hFFFE then
      print "(bom) UTF-16 (Big Endian)"
      iBomSz = 2
      iBomType = btUTF16be
    elseif uLead2 = &hFEFF then
      print "(bom) UTF-16 (Little Endian)"
      iBomSz = 2
      iBomType = btUTF16le
    end if
  end scope

  #if 0 'forcing autodetection...
  if iBomType <> btUnknown then
    print "Autodetected: ";
    iBomType = btUnknown
  end if
  #endif
  58.  
  if iBomType = btUnknown then
    ' No BOM was recognised: guess the encoding from how zero bytes and
    ' bytes above 127 are distributed through the file.
    '
    ' iCount      length of the run currently being scanned
    ' iLast       kind of that run: -1 none yet, 0 zero bytes, 1 non-zero bytes
    ' iZeroNone   total of all non-zero run lengths (= number of non-zero bytes)
    ' iZeroOne    number of zero runs exactly 1 byte long
    ' iZeroTwo    number of zero runs exactly 2 bytes long
    ' iZeroMore   number of zero runs longer than 2 bytes
    ' iZeroStreak total bytes contained in those longer zero runs
    ' iZeroCount  total number of zero bytes
    ' iZeroHigh   number of bytes above 127
    ' iZeroOdd / iZeroEven    zero bytes found at odd / even offsets
    ' iZeroOddW / iZeroEvenW  2-byte zero runs, split by the offset (mod 4)
    '                         of the byte that follows the run
    dim as integer iCount,iZeroNone=iBomSz,iZeroOne,iZeroTwo,iZeroMore
    dim as integer iZeroCount,iZeroHigh,iZeroStreak
    dim as integer iZeroOdd,iZeroEven,iZeroOddW,iZeroEvenW, iLast = -1

    ' Append a sentinel of the opposite kind to the last byte so the final
    ' run is flushed by the loop. A zero sentinel would be counted as a
    ' zero byte, so iZeroCount starts at -1 to cancel it out.
    if pFile[iFileSz-1]=0 then
      pFile[iFileSz]=1
    else
      pFile[iFileSz]=0: iZeroCount = -1
    end if

    ' Gather the statistics (the loop deliberately includes the sentinel).
    for N as integer = 0 to iFileSz
      var iChar = cint(pFile[N])
      if iChar=0 then
        iZeroCount += 1
        ' The sentinel is not file data, keep it out of the parity counts.
        if N < iFileSz then
          if (N and 1) then iZeroOdd += 1 else iZeroEven += 1
        end if
        if iLast=0 then
          iCount += 1
        else
          ' A non-zero run just ended: add its length to the total.
          if iCount > 0 then iZeroNone += iCount
          iCount = 1: iLast=0
        end if
      else
        if iChar > 127 then iZeroHigh += 1
        if iLast=1 then
          iCount += 1
        else
          ' A zero run just ended (or this is the first byte, iCount = 0):
          ' classify it by its length.
          select case iCount
          case 0:    'nothing
          case 1:    iZeroOne += 1
          case 2:    iZeroTwo += 1: if (N and 3) > 1 then iZeroOddW += 1 else iZeroEvenW += 1
          case else: iZeroMore += 1: iZeroStreak += iCount
          end select
          iCount = 1: iLast=1
        end if
      end if
    next N

    if iZeroNone = iFileSz and iZeroHigh=0 then
      ' Every byte is non-zero and none is above 127.
      print "Ascii": iBomType = btAscii
    elseif iZeroCount < iFileSz\4 then
      ' Few zero bytes: accepted as 8-bit text only when high bytes
      ' outnumber the zero bytes.
      if iZeroHigh > iZeroCount then
        print "UTF-8 or Ansi": iBomType = btUtf8
      end if
    else
      if iZeroCount > iFileSz*.66 then 'UTF32
        ' NOTE(review): ASCII-range UTF-32 text produces zero runs of three
        ' bytes, which land in iZeroStreak rather than iZeroTwo, so this
        ' test looks hard to satisfy for such files - confirm the intent.
        if iZeroTwo*2 > iZeroStreak then
          if iZeroOddW > iZeroEvenW then
            print "UTF32-LE": iBomType = btUTF32le
          else
            print "UTF32-BE": iBomType = btUTF32be
          end if
        end if
      else
        if iZeroOne > iZeroTwo and (iZeroOne*2) > iZeroStreak then 'UTF16
          ' Zero bytes mostly at odd offsets means the zero byte follows
          ' the character byte, i.e. little endian.
          if iZeroOdd > iZeroEven then
            print "UTF16-LE": iBomType = btUTF16le
          else
            ' The label used to read "UTF32-BE" although the type set here
            ' is UTF-16 big endian.
            print "UTF16-BE": iBomType = btUTF16be
          end if
        end if
      end if
    end if

    ' Nothing matched: report the file as binary.
    if iBomType = btUnknown then print "Binary"

  end if
  125.  
  ' The search for non-ASCII characters below walks the buffer as 16-bit
  ' little-endian units, so any other detected type ends the program here.
  print
  if iBomType = btUTF16LE then
    ' supported encoding - continue with the scan
  else
    print "This is only for UTF16LE"
    sleep
    system
  end if
  132.  
  ' Prints one line of 16-bit characters with its line number, showing
  ' every character outside tab / 32..127 in a different colour.
  '   pStart   first character of the line
  '   pEnd     last character of the line (inclusive)
  '   iLineNum line number shown in the "Line NNNNN:" prefix
  sub ShowLine( pStart as ushort ptr , pEnd as ushort ptr , iLineNum as integer )
    ' %i expects a 32-bit int, while a FreeBASIC integer is 64-bit on 64-bit
    ' targets, so the value is narrowed explicitly before it reaches printf.
    color 11: printf( "Line %05i: '", clng(iLineNum) )
    while pStart <= pEnd
      select case *pStart
      case 9,32 to 127 'exceptions again (without 13)
        color 8,0      'plain character
      case else
        color 12,1     'anything else is highlighted
      end select
      print wchr(*pStart);
      pStart += 1
    wend
    ' Restore the prefix colour and close the quote opened by the prefix.
    color 11,0: print "'"
  end sub
  147.  
  ' Scan the text as 16-bit units, starting right after the BOM.
  ' pLine marks the start of the current line, pChar the unit under
  ' inspection; HasInvalid is set once the current line contains a
  ' character outside tab / CR / 32..127.
  var pLine = cast(ushort ptr,pFile+iBomSz), pChar = pLine, iLineNum = 1 , HasInvalid = 0
  ' One past the last complete 16-bit unit. Rounding down keeps a stray odd
  ' trailing byte from being read as half of a character.
  var pStop = pLine + cint((iFileSz-iBomSz)\2)

  while cuint(pChar) < cuint(pStop)
    select case *pChar
    case 9,13,32 to 127 'ascii and <32 exceptions (13 is part of the line end, ignored here)
      pChar += 1
    case 10             'end of line
      if HasInvalid then 'show the offending line
        var pEnd = pChar-1
        if *pEnd = 13 then pEnd -= 1 'was CRLF so drop the CR as well
        ShowLine( pLine , pEnd , iLineNum )
      end if
      ' Start the next line.
      pChar += 1: iLineNum += 1
      pLine = pChar: HasInvalid = 0
    case else
      HasInvalid = 1: pChar += 1
    end select
  wend

  ' Last line without a trailing LF. pChar is one past the final unit and
  ' ShowLine's end pointer is inclusive, so pass pChar-1 (HasInvalid
  ' guarantees the line holds at least one unit).
  if HasInvalid then ShowLine(pLine,pChar-1,iLineNum)

  ' The buffer is not used past this point.
  deallocate(pFile)

  color 7: print "Done."
  sleep
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement