Attribute VB_Name = "Module1" Option Explicit Sub COUNTCHRS() Dim Myline As String, n As Long, m As Long, mychar As String, i As Long, j As Long Dim MyVal As Long, MYpath As String, Language As String, MMout(65536) As Integer, MMoutChr(65536) As String, MMoutCount(65536) As Long MYpath = ThisWorkbook.Path MYpath = StrReverse(MYpath) i = InStr(1, MYpath, "\", vbTextCompare) Language = StrReverse(Left$(MYpath, i - 1)) ' The following code assumes the subject language is the same as the name of the directory where this file has been stored. ' It further assumes that a master file with the same name containing one or more e-books in that language has been also stored ' in that directory. Directory : English, Master file: English.txt ' For languages such as Arabic or Russian (with real funny characters), the master file should use the .html format that supports ' unicode character (2 bytes per character). MYpath = StrReverse(MYpath) & "\" & Language & ".txt" Sheets("Sheet2").Range("A2:C65536").ClearContents m = FreeFile Open MYpath For Input As #m Do While Not EOF(m) ' Stop at end of file Line Input #m, Myline ' get one line of text ' store the ascii code, the character itself, and the count in the appropriate array variable. ' This is far faster than storing this information directly on Sheet2 For i = 0 To Len(Myline) - 1 mychar = UCase(Mid$(Myline, i + 1, 1)) MyVal = AscW(mychar) ' get the ascii code for two bytes wide characters, if applicable. The Asc function only returns ' the value of the second byte MMout(MyVal) = MyVal MMoutChr(MyVal) = mychar MMoutCount(MyVal) = MMoutCount(MyVal) + 1 Next i Loop Close #m ' We are done counting. Now, store the results from the arrays to the spreadsheet With Sheets("SHEET2").Range("a1") j = 1 For i = 0 To 65535 If MMoutCount(i) > 0 Then .Offset(j, 0) = MMout(i) .Offset(j, 1) = MMoutChr(i) .Offset(j, 2) = MMoutCount(i) j = j + 1 End If Next i End With End Sub Function NoAccent(strMM As String) As String ' this function assumes the input string is upper case ' which is done in the toUC macro. Dim i As Long, j As Integer i = 1 If strMM = "" Then NoAccent = "" Exit Function End If Do j = AscW(Mid$(strMM, i, 1)) ' AscW correctly handles 2 byte wide unicode characters Select Case j Case 8364 ' UNICODE character - The standard ASCII table goes from 0 to 255. strMM = Replace(strMM, "€", "EURO") ' For Euro zone, including most of Europe ' Case 196 ' strMM = Replace(strMM, "Ä", "AE") ' For the German language, by ACA convention ' Case 214 ' strMM = Replace(strMM, "Ö", "OE") ' For the German language, by ACA convention ' Case 220 ' strMM = Replace(strMM, "Ü", "UE") ' For the German language, by ACA convention Case 223 strMM = Replace(strMM, "ß", "SS") ' For the German language ' There are many other symbols, some of which will require special habdling. Case 338 ' Unicode strMM = Replace(strMM, "Œ", "OE") ' Note that StrMM must be redimensioned, using Replace is slower Case 198 strMM = Replace(strMM, "Æ", "AE") Case 65 To 90 ' A to Z, do not change Case 212 ' Ô Mid$(strMM, i, 1) = "O" ' No redimension, therefore faster Case 201, 203 Mid$(strMM, i, 1) = "E" ' É, Ë Case 199 ' Ç Mid$(strMM, i, 1) = "C" Case Else Mid$(strMM, i, 1) = "[" ' Chr (91), for spaces and everything else End Select i = i + 1 If i > Len(strMM) Then Exit Do Loop NoAccent = strMM End Function Sub toUC() Dim Myline As String, n As Long, m As Long Dim Lineout As String, i As Long, j As Long Dim MYpath As String, Language As String, Mypath2 As String MYpath = ThisWorkbook.Path MYpath = StrReverse(MYpath) i = InStr(1, MYpath, "\", vbTextCompare) Language = StrReverse(Left$(MYpath, i - 1)) Mypath2 = StrReverse(MYpath) & "\" & Language & "UC.txt" ' specifies the output file MYpath = StrReverse(MYpath) & "\" & Language & ".txt" ' Input data m = FreeFile Open MYpath For Input As #m n = FreeFile Open Mypath2 For Output As #n j = 0 Do While Not EOF(m) j = j + 1 Line Input #m, Myline Lineout = NoAccent(UCase(Myline)) ' Next, remove extra word dividers Again: If InStr(1, Lineout, "[[") > 0 Then Lineout = Replace(Lineout, "[[", "["): GoTo Again ' Remove word dividers from start or end of line If Right$(Lineout, 1) = "[" Then Lineout = Left$(Lineout, Len(Lineout) - 1) If Left$(Lineout, 1) = "[" Then Lineout = Right$(Lineout, Len(Lineout) - 1) ' Output if there is anything left If Len(Lineout) > 0 Then Print #n, Lineout Application.StatusBar = j Loop Close #m Close #n Application.StatusBar = False End Sub Sub MakeTetrasSpace() Dim n As Long, Myline As String, MyL As String Dim i As Long, x(200) As Byte, T As String, myMax As Long Dim a(27 ^ 4) As Long, ptr As Long Dim zz As Byte Dim MYpath As String, Language As String, Mypath2 As String MYpath = ThisWorkbook.Path MYpath = StrReverse(MYpath) i = InStr(1, MYpath, "\", vbTextCompare) Language = StrReverse(Left$(MYpath, i - 1)) ' Allocate the output file Mypath2 = StrReverse(MYpath) & "\TetLogSpace_" & Language & ".txt" ' Specify the input file MYpath = StrReverse(MYpath) & "\" & Language & "UC.txt" n = FreeFile Open MYpath For Input As #n MyL = "" Do While Not EOF(n) ' stop if end of file Line Input #n, Myline ' get a line of text MyL = MyL + Chr(91) + Myline ' and append it to MyL MyL = Replace(MyL, Chr(91) & Chr(91), Chr(91)) ' Get rid of surplus word dividers, if any ' convert characters to numbers, where A =0, B =2, .., [ =27, and store in array x() For i = 0 To Len(MyL) - 1 x(i) = Asc(Mid$(MyL, i + 1, 1)) - 65 Next i ' calculate the array pointer ptr = 27 * 27 * x(0) + 27 * x(1) + x(2) ' the first three characters For i = 3 To Len(MyL) - 4 ' Here, we calculate the pointers using a faster method ptr = 27 * (ptr Mod 27 * 27 * 27) + x(i) ' using Mod is much faster than repeating the full calculation a(ptr) = a(ptr) + 1 ' given the pointer, increment the count by 1 Next i ' go process the next tetragram, if nany MyL = Right$(MyL, 3) ' we now have only 3 characters left, so go append the next line of text Loop Close #n ' we have reached the end of file, so close it myMax = 0 ' now, let's create the tetlog file n = FreeFile Open Mypath2 For Output As #n For i = 0 To 27 ^ 4 - 1 ' and load the data If a(i) = 0 Then zz = 0 ' Log(0) would cause an error Else zz = CByte(1 + Log(a(i))) ' if not zero, get the log, and add 1 End If Write #n, zz; ' store either zero or the log of count, plus 1 Next i Write #n, Close #n End Sub Sub MakeTetras() 'This is identical to the previous routine, except we get rid of the word dividers altogether Dim n As Long, Myline As String, MyL As String Dim i As Long, x(200) As Byte, T As String, myMax As Long Dim a(26 ^ 4) As Long, ptr As Long Dim zz As Byte Dim MYpath As String, Language As String, Mypath2 As String MYpath = ThisWorkbook.Path MYpath = StrReverse(MYpath) i = InStr(1, MYpath, "\", vbTextCompare) Language = StrReverse(Left$(MYpath, i - 1)) Mypath2 = StrReverse(MYpath) & "\TetLog_" & Language & ".txt" MYpath = StrReverse(MYpath) & "\" & Language & "UC.txt" n = FreeFile Open MYpath For Input As #n MyL = "" Do While Not EOF(n) Line Input #n, Myline MyL = MyL + Myline MyL = Replace(MyL, Chr(91), "") ' No word dividers For i = 0 To Len(MyL) - 1 x(i) = Asc(Mid$(MyL, i + 1, 1)) - 65 Next i ptr = 26 * 26 * x(0) + 26 * x(1) + x(2) For i = 3 To Len(MyL) - 4 ptr = 26 * (ptr Mod 26 * 26 * 26) + x(i) a(ptr) = a(ptr) + 1 Next i MyL = Right$(MyL, 3) Loop Close #n myMax = 0 n = FreeFile Open Mypath2 For Output As #n For i = 0 To 26 ^ 4 - 1 If a(i) = 0 Then zz = 0 Else zz = CByte(1 + Log(a(i))) End If Write #n, zz; Next i Write #n, Close #n End Sub Function Wasc(x As String) As Long ' This function converts a unicode character to its ascii equivalent Wasc = AscW(x) End Function Function Wchar(x As Long) As String ' This function converts a number 0<=x<(256*256) to a unicode character Wchar = ChrW(x) End Function