Project: https://github.com/edleijnse/wordDocumentReader
This Python script is a sample of how to traverse a directory and its subdirectories, select the Word documents, extract their text, and write it all to a new text file. At the end, two summaries are added: one listing each word with its frequency (sorted by word), and another listing each frequency with its word (sorted by frequency).
Sample of the “frequency / word” summary:
37 we
38 about
38 from
38 this
39 B
39 C
39 D
39 YOUR
39 make
39 very
40 been
40 do
41 use
43 can
43 so
49 He
49 an
50 money
# This is a sample Python script.
from docx2python import docx2python
from collections import Counter
import os
import pandas as pd
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
def extract_words(filename, outputFile):
    """Append one document's text to the report and return its words.

    A banner containing the file name is printed to stdout and written to
    ``outputFile``, followed by the text that docx2python extracts from the
    document. The text is then split on whitespace.

    Args:
        filename: Path of the Word document, passed to ``docx2python``.
            Must be a ``str`` because it is concatenated into the banner.
        outputFile: Writable text stream that receives the banner and text.

    Returns:
        The list of whitespace-separated tokens found in the document text.
    """
    print("---------------------------------------------------------------------------")
    outputFile.write("\n------------------------------------------------------------------------------------------------\n")
    print(filename)
    outputFile.write(filename+"\n")
    print("---------------------------------------------------------------------------")
    outputFile.write("------------------------------------------------------------------------------------------------\n")
    doc_result = docx2python(filename).text
    print(doc_result)
    # A single write() call: writelines() on a str iterates it and issues one
    # write per character, which produces the same output far more slowly.
    outputFile.write(doc_result)
    # str.split() with no argument splits on any run of whitespace.
    return doc_result.split()
def print_tallied(sorted_tallied, outputFile, title):
    """Emit a titled ``word:count`` listing followed by a total line.

    The title is framed by two dashed rules in ``outputFile``. Every entry
    whose count is positive and whose word is purely alphabetic is printed to
    stdout and written to ``outputFile`` as ``word:count``. The closing
    ``total: N`` line reports how many entries were listed.

    Args:
        sorted_tallied: Iterable of ``(word, count)`` entries, already in the
            order in which they should be listed.
        outputFile: Writable text stream that receives the listing.
        title: Heading text written between the two rules.
    """
    rule = "\n----------------------------------------------------------------------------------------------\n"
    for chunk in (rule, title, rule):
        outputFile.write(chunk)

    listed = 0
    for entry in sorted_tallied:
        word, count = entry[0], entry[1]
        # Skip zero counts and tokens with digits or punctuation in them.
        if not (count > 0 and word.isalpha()):
            continue
        line = f"{word}:{count}"
        print(line)
        outputFile.write(line + "\n")
        listed += 1

    summary = f"total: {listed}"
    print(summary)
    outputFile.write(summary)
def sort_panda(tallied, outputFile):
    """Write the ``count word`` summary, ordered by count and then by word.

    Entries with a positive count and a purely alphabetic word are echoed to
    stdout as ``word:count`` while being collected. They are then loaded into
    a pandas DataFrame, sorted ascending by ``Count`` and ``Word``, and each
    row is printed and written to ``outputFile`` as ``count word``.

    Args:
        tallied: Iterable of ``(word, count)`` entries.
        outputFile: Writable text stream that receives the summary.
    """
    rule = "\n----------------------------------------------------------------------------------------------\n"
    outputFile.write(rule)
    outputFile.write("summary words by frequency")
    outputFile.write(rule)

    words = []
    counts = []
    for entry in tallied:
        # Skip zero counts and tokens with digits or punctuation in them.
        if not (entry[1] > 0 and entry[0].isalpha()):
            continue
        words.append(entry[0])
        counts.append(entry[1])
        print(str(entry[0]) + ":" + str(entry[1]))

    frame = pd.DataFrame({'Count': counts, 'Word': words}, columns=['Count', 'Word'])
    ordered = frame.sort_values(by=['Count', 'Word'])
    for count, word in ordered.values:
        line = str(count) + " " + word
        print(line)
        outputFile.write(line + "\n")
def word_document_reader(inputDir, outputFile):
    """Dump the text of every .docx file under inputDir and append word summaries.

    The directory tree is walked with ``os.walk``. Each ``.docx`` file is
    passed to ``extract_words``, which writes its text to ``outputFile`` and
    returns its words. After the walk, two summaries are appended: the words
    in sorted order with their counts (``print_tallied``) and the counts with
    their words ordered by frequency (``sort_panda``).

    Args:
        inputDir: Root directory to search recursively.
        outputFile: Writable text stream that receives the texts and summaries.
    """
    print(f'Hi, {inputDir}')
    tallied = Counter()
    for path, _subdirs, files in os.walk(inputDir):
        for file in files:
            # Test the extension of the file name itself. A substring test on
            # the joined path would also select every file that merely lives
            # under a directory whose name contains "doc".
            if not file.lower().endswith(".docx"):
                continue
            # Word keeps "~$name.docx" owner/lock files next to open
            # documents; they are not real documents, so skip them.
            if file.startswith("~$"):
                continue
            filename = os.path.join(path, file)
            # Tally incrementally instead of rebuilding one ever-growing list
            # of all words with repeated list concatenation.
            tallied.update(extract_words(filename, outputFile))
    sorted_tallied = sorted(tallied.items())
    print_tallied(sorted_tallied, outputFile, "summary of words")
    sort_panda(tallied.items(), outputFile)
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # Script entry point: read the documents under inputDir and write the
    # combined text and word summaries to a UTF-8 report file.
    inputDir = r"f:\swissedu_attachments"
    # The with-block closes the report even if processing raises part-way
    # through, so the output already written is flushed and the handle freed.
    with open(r"f:\swissedu.txt", "w", encoding="utf-8") as outputFile:
        word_document_reader(inputDir, outputFile)
# See PyCharm help at https://www.jetbrains.com/help/pycharm/