Attribute VB_Name = "Module1"
Option Explicit
Sub COUNTCHRS()
Dim Myline As String, n As Long, m As Long, mychar As String, i As Long, j As Long Dim MyVal As Long, MYpath As String, Language As String, MMout(65536) As Integer, MMoutChr(65536) As String, MMoutCount(65536) As Long MYpath = ThisWorkbook.Path MYpath = StrReverse(MYpath) i = InStr(1, MYpath, "\", vbTextCompare) Language = StrReverse(Left$(MYpath, i - 1)) ' The following code assumes the subject language is the same as the name of the directory where this file has been stored.
' It further assumes that a master file with the same name containing one or more e-books in that language has been also stored ' in that directory. Directory : English, Master file: English.txt ' For languages such as Arabic or Russian (with real funny characters), the master file should use the .html format that supports ' unicode character (2 bytes per character).
MYpath = StrReverse(MYpath) & "\" & Language & ".txt"
Sheets("Sheet2").Range("A2:C65536").ClearContents
m = FreeFile
Open MYpath For Input As #m
Do While Not EOF(m) ' Stop at end of file Line Input #m, Myline ' get one line of text ' store the ascii code, the character itself, and the count in the appropriate array variable.
' This is far faster than storing this information directly on Sheet2 For i = 0 To Len(Myline) - 1
            mychar = UCase(Mid$(Myline, i + 1, 1))
            MyVal = AscW(mychar)    ' get the ascii code for two bytes
wide characters, if applicable. The Asc function only returns
                                    ' the value of the second byte
            MMout(MyVal) = MyVal
            MMoutChr(MyVal) = mychar
            MMoutCount(MyVal) = MMoutCount(MyVal) + 1 Next i Loop Close #m ' We are done counting. Now, store the results from the arrays to the spreadsheet With Sheets("SHEET2").Range("a1") j = 1 For i = 0 To 65535
    If MMoutCount(i) > 0 Then
            .Offset(j, 0) = MMout(i)
            .Offset(j, 1) = MMoutChr(i)
            .Offset(j, 2) = MMoutCount(i)
            j = j + 1
    End If
Next i
End With
End Sub

Function NoAccent(strMM As String) As String ' this function assumes the input string is upper case ' which is done in the toUC macro.
Dim i As Long, j As Integer
i = 1
If strMM = "" Then
NoAccent = ""
Exit Function
End If
Do
    j = AscW(Mid$(strMM, i, 1))         ' AscW correctly handles 2
byte wide unicode characters
    Select Case j
    Case 8364                           ' UNICODE character - The
standard ASCII table goes from 0 to 255.
    strMM = Replace(strMM, "€", "EURO")  ' For Euro zone, including most of Europe
    ' Case 196
    ' strMM = Replace(strMM, "Ä", "AE")  ' For the German language, by ACA convention
    ' Case 214
    ' strMM = Replace(strMM, "Ö", "OE")  ' For the German language, by ACA convention
    ' Case 220
    ' strMM = Replace(strMM, "Ü", "UE")  ' For the German language, by ACA convention
     Case 223
     strMM = Replace(strMM, "ß", "SS")  ' For the German language
    ' There are many other symbols, some of which will require special habdling.
    Case 338                            ' Unicode
    strMM = Replace(strMM, "Œ", "OE")   ' Note that StrMM must be
redimensioned, using Replace is slower
    Case 198
    strMM = Replace(strMM, "Æ", "AE")
    Case 65 To 90                       ' A to Z, do not change
    Case 212                            ' Ô
    Mid$(strMM, i, 1) = "O"             ' No redimension, therefore faster
    Case 201, 203
    Mid$(strMM, i, 1) = "E"             ' É, Ë
    Case 199                            ' Ç
    Mid$(strMM, i, 1) = "C"
    Case Else
    Mid$(strMM, i, 1) = "["             ' Chr (91), for spaces and
everything else
    End Select
    i = i + 1
    If i > Len(strMM) Then Exit Do
Loop
NoAccent = strMM
End Function

Sub toUC()
Dim Myline As String, n As Long, m As Long Dim Lineout As String, i As Long, j As Long Dim MYpath As String, Language As String, Mypath2 As String MYpath = ThisWorkbook.Path MYpath = StrReverse(MYpath) i = InStr(1, MYpath, "\", vbTextCompare) Language = StrReverse(Left$(MYpath, i - 1))
Mypath2 = StrReverse(MYpath) & "\" & Language & "UC.txt"    '
specifies the output file
MYpath = StrReverse(MYpath) & "\" & Language & ".txt"       ' Input data
m = FreeFile
Open MYpath For Input As #m
n = FreeFile
Open Mypath2 For Output As #n
j = 0
Do While Not EOF(m)
j = j + 1
Line Input #m, Myline
Lineout = NoAccent(UCase(Myline))
' Next, remove extra word dividers
Again: If InStr(1, Lineout, "[[") > 0 Then Lineout = Replace(Lineout, "[[", "["): GoTo Again ' Remove word dividers from start or end of line If Right$(Lineout, 1) = "[" Then Lineout = Left$(Lineout, Len(Lineout) - 1) If Left$(Lineout, 1) = "[" Then Lineout = Right$(Lineout, Len(Lineout) - 1) ' Output if there is anything left If Len(Lineout) > 0 Then Print #n, Lineout Application.StatusBar = j Loop Close #m Close #n Application.StatusBar = False End Sub

Sub MakeTetrasSpace()
Dim n As Long, Myline As String, MyL As String Dim i As Long, x(200) As Byte, T As String, myMax As Long Dim a(27 ^ 4) As Long, ptr As Long Dim zz As Byte Dim MYpath As String, Language As String, Mypath2 As String MYpath = ThisWorkbook.Path MYpath = StrReverse(MYpath) i = InStr(1, MYpath, "\", vbTextCompare) Language = StrReverse(Left$(MYpath, i - 1)) ' Allocate the output file
Mypath2 = StrReverse(MYpath) & "\TetLogSpace_" & Language & ".txt"
' Specify the input file
MYpath = StrReverse(MYpath) & "\" & Language & "UC.txt"
n = FreeFile
Open MYpath For Input As #n
MyL = ""
Do While Not EOF(n)             ' stop if end of file
Line Input #n, Myline           ' get a line of text
MyL = MyL + Chr(91) + Myline    ' and append it to MyL
MyL = Replace(MyL, Chr(91) & Chr(91), Chr(91))  ' Get rid of surplus word dividers, if any ' convert characters to numbers, where A =0, B =2, .., [ =27, and store in array x() For i = 0 To Len(MyL) - 1
x(i) = Asc(Mid$(MyL, i + 1, 1)) - 65
Next i
' calculate the array pointer
ptr = 27 * 27 * x(0) + 27 * x(1) + x(2)     ' the first three characters
For i = 3 To Len(MyL) - 4                   ' Here, we calculate the
pointers using a faster method
ptr = 27 * (ptr Mod 27 * 27 * 27) + x(i)    ' using Mod is much faster
than repeating the full calculation
a(ptr) = a(ptr) + 1                         ' given the pointer,
increment the count by 1
Next i                                      ' go  process the next
tetragram, if nany
MyL = Right$(MyL, 3)                        ' we now have only 3
characters left, so go append the next line of text Loop
Close #n                                    ' we have reached the end
of file, so close it
myMax = 0                                   ' now, let's create the tetlog file
n = FreeFile
Open Mypath2 For Output As #n
For i = 0 To 27 ^ 4 - 1                     ' and load the data

    If a(i) = 0 Then
        zz = 0                              ' Log(0) would cause an error
    Else
        zz = CByte(1 + Log(a(i)))           ' if not zero, get the
log, and add 1
    End If
Write #n, zz;                               ' store either zero or the
log of count, plus 1
Next i
Write #n,
Close #n
End Sub
Sub MakeTetras()
'This is identical to the previous routine, except we get rid of the word dividers altogether Dim n As Long, Myline As String, MyL As String Dim i As Long, x(200) As Byte, T As String, myMax As Long Dim a(26 ^ 4) As Long, ptr As Long Dim zz As Byte Dim MYpath As String, Language As String, Mypath2 As String MYpath = ThisWorkbook.Path MYpath = StrReverse(MYpath) i = InStr(1, MYpath, "\", vbTextCompare) Language = StrReverse(Left$(MYpath, i - 1))
Mypath2 = StrReverse(MYpath) & "\TetLog_" & Language & ".txt"
MYpath = StrReverse(MYpath) & "\" & Language & "UC.txt"
n = FreeFile
Open MYpath For Input As #n
MyL = ""
Do While Not EOF(n)
Line Input #n, Myline
MyL = MyL + Myline
MyL = Replace(MyL, Chr(91), "")         ' No word dividers
For i = 0 To Len(MyL) - 1
x(i) = Asc(Mid$(MyL, i + 1, 1)) - 65
Next i
ptr = 26 * 26 * x(0) + 26 * x(1) + x(2)
For i = 3 To Len(MyL) - 4
ptr = 26 * (ptr Mod 26 * 26 * 26) + x(i)
a(ptr) = a(ptr) + 1
Next i
MyL = Right$(MyL, 3)
Loop
Close #n
myMax = 0
n = FreeFile
Open Mypath2 For Output As #n
For i = 0 To 26 ^ 4 - 1

    If a(i) = 0 Then
        zz = 0
    Else
        zz = CByte(1 + Log(a(i)))
    End If
Write #n, zz;
Next i
Write #n,
Close #n
End Sub

Function Wasc(x As String) As Long
' This function converts a unicode character to its ascii equivalent Wasc = AscW(x) End Function Function Wchar(x As Long) As String ' This function converts a number 0<=x<(256*256) to a unicode character Wchar = ChrW(x) End Function