;
; ------------------------------------------------------------
;
;   PureBasic - UTF8-String example file
;
;    (c) 2018 - Fantaisie Software
;
; ------------------------------------------------------------
;
; This example shows how to guess whether a raw string is a UTF-8 string.

; seems_utf8(*StrMem, iLen)
;
;   Heuristically checks whether the first iLen bytes at *StrMem look like
;   UTF-8 encoded text.
;
;   *StrMem : address of the buffer to inspect
;   iLen    : number of BYTES (not characters) to inspect
;
;   Returns #True (1) if every byte sequence is well formed AND at least one
;   multi-byte sequence was found. Returns #False (0) otherwise, i.e. for
;   malformed input, for a multi-byte sequence truncated by the end of the
;   buffer, for a null pointer / non-positive length, and for pure 7-bit
;   ASCII (which cannot be told apart from UTF-8).
;
;   Note: the legacy 5- and 6-byte lead bytes ($F8..$FD) are still accepted,
;   although RFC 3629 limits UTF-8 to 4 bytes per character.
Procedure seems_utf8(*StrMem, iLen.i)

  Protected iCnt.i                      ; offset of the lead byte being examined
  Protected aCharVal.a                  ; value (0-255) of that byte
  Protected iNext.i                     ; number of continuation bytes expected
  Protected iCnt2.i                     ; index over the continuation bytes
  Protected iUtf8Detected.i = #False    ; becomes #True once a multi-byte sequence is seen

  ; Nothing to inspect: report "not UTF-8" instead of peeking at invalid memory.
  If *StrMem = 0 Or iLen <= 0
    ProcedureReturn #False
  EndIf

  ; The length is in bytes, so every byte of the buffer has to be checked.
  For iCnt = 0 To iLen - 1

    ; Get the byte code 0-255 of the current byte.
    aCharVal = PeekA(*StrMem + iCnt)
    ; Debug "Cnt >"+iCnt+"< CharVal >"+aCharVal+"< - >"+Right("00000000"+Bin(aCharVal, #PB_Ascii),8)+"< Char>"+Chr(aCharVal)+"<"

    ; A UTF-8 character takes 1 to 6 bytes here. How many exactly is encoded
    ; in its first byte when that byte is >= 128 (highest bit set); every
    ; value <= 127 is plain ASCII, which is identical in UTF-8.
    ; The bit pattern matched for each length is shown as a comment.
    ; iNext receives the number of additional (continuation) bytes.
    If (aCharVal < $80)              : iNext = 0 ; 0bbbbbbb
    ElseIf ((aCharVal & $E0) = $C0)  : iNext = 1 ; 110bbbbb
    ElseIf ((aCharVal & $F0) = $E0)  : iNext = 2 ; 1110bbbb
    ElseIf ((aCharVal & $F8) = $F0)  : iNext = 3 ; 11110bbb
    ElseIf ((aCharVal & $FC) = $F8)  : iNext = 4 ; 111110bb
    ElseIf ((aCharVal & $FE) = $FC)  : iNext = 5 ; 1111110b
    Else
      ; Debug "error >"+iCnt+"< iLen >"+iLen+"< "
      ProcedureReturn #False ; does not match any lead-byte pattern
    EndIf

    If iNext > 0
      ; Debug "Next >"+iNext+"<"

      ; The whole sequence has to fit inside the buffer. Valid offsets are
      ; 0 .. iLen-1, so the last continuation byte (iCnt+iNext) must be < iLen.
      If iCnt + iNext >= iLen
        ProcedureReturn #False ; sequence truncated by the end of the buffer
      EndIf

      ; Every continuation byte must match 10bbbbbb, i.e. (byte & $C0) = $80.
      For iCnt2 = 1 To iNext
        If (PeekA(*StrMem + iCnt + iCnt2) & $C0) <> $80
          ; Debug "false iCnt >"+iCnt+"< iCnt2>"+iCnt2+"< iLen >"+iLen+"< iNext >"+iNext+"< CHAR >"+PeekA(*StrMem+iCnt)
          ProcedureReturn #False
        EndIf
      Next

      iUtf8Detected = #True

      ; Skip the continuation bytes that were just validated; the outer
      ; 'Next' then moves on to the following lead byte.
      iCnt + iNext
    EndIf

  Next

  ProcedureReturn iUtf8Detected
EndProcedure


unicode.s = "Hélé"                      ; Since PB 5.5, all strings are created in unicode by default

; "Hé Hele" needs 8 bytes in UTF-8 plus the terminating zero written by PokeS,
; so the scratch buffer must be larger than the 4 bytes originally reserved.
*Mem0 = AllocateMemory(16)
PokeS(*Mem0, "Hélé", 4, #PB_UTF8)       ; Fills a buffer with a UTF-8 string

*Mem1 = Ascii("Hélé")                   ; Fills a buffer with an ASCII string
*Mem2 = UTF8("Hélé")                    ; Fills a buffer with a UTF-8 string

Debug seems_utf8(@"Hélé", 4)            ; displays #False because it's a unicode string
Debug seems_utf8(@unicode, 4)           ; displays #False because it's a unicode string
Debug seems_utf8(*Mem0, 4)              ; displays #True because it's a UTF-8 string
Debug seems_utf8(*Mem1, 4)              ; displays #False because it's an ASCII string
Debug seems_utf8(*Mem2, 4)              ; displays #True because it's a UTF-8 string

PokeS(*Mem0, "Hele", 4, #PB_UTF8)       ; Fills a buffer with a UTF-8 string
Debug seems_utf8(*Mem0, 4)              ; displays #False because the buffer holds 7-bit ASCII characters only ($00 to $7F = 0 to 127),
                                        ; so it's not possible to tell an ASCII string from a UTF-8 string

PokeS(*Mem0, "Hé Hele", 7, #PB_UTF8)    ; Fills a buffer with a UTF-8 string
Debug seems_utf8(*Mem0, 7)              ; displays #True because it's a UTF-8 string

; Buffers returned by AllocateMemory(), Ascii() and UTF8() must be released.
FreeMemory(*Mem0)
FreeMemory(*Mem1)
FreeMemory(*Mem2)