Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

""" 

Watch a directory for new pdf files and report each one once it has stopped changing.

""" 

 

import sys, os 

import re 

import logging 

import shutil 

import time 

import glob 

 

from threading import Lock 

 

from watchdog.observers import Observer 

from watchdog.events import LoggingEventHandler 

from watchdog.events import FileSystemEventHandler 

 

 

class PyPdfWatcher(FileSystemEventHandler):
    """
        Watch a folder for new pdf files.

        If new file event, then add it to queue with timestamp.
        If file modified event, then change timestamp in queue.
        Every ``scan_interval`` seconds the queue is scanned; a file whose
        timestamp is older than ``scan_interval`` seconds is handed off for
        processing, anything newer is left in the queue for a later scan.

        The queue (``events``) and its lock are class attributes, so they are
        shared by every instance of this class.
    """

    # Maps a pdf path to the time.time() of its last event, or to -1 once the
    # file has been handed off by check_queue and only awaits purging.
    events = {}

    # Guards every read and write of ``events``.
    # NOTE(review): presumably needed because watchdog invokes the on_*
    # callbacks from its observer thread -- confirm against watchdog docs.
    events_lock = Lock()

    def __init__(self, monitor_dir, config):
        """
            :param monitor_dir: Directory to watch for pdf files
            :type monitor_dir: string
            :param config: Options; only ``scan_interval`` is read.  A falsy
                           value (None, {}) selects the defaults.
            :type config: dict
        """
        FileSystemEventHandler.__init__(self)

        self.monitor_dir = monitor_dir
        if not config:
            config = {}

        # If no updates in 3 seconds (or user specified option in config file) process file
        self.scan_interval = config.get('scan_interval', 3)

        # Created by start(); kept as None until then so stop() is always safe.
        self.observer = None

    def start(self):
        """
            Start watching ``monitor_dir`` and yield files that are ready.

            This is a generator: nothing happens until it is iterated.  It
            then loops forever, sleeping ``scan_interval`` seconds between
            calls to :func:`check_queue`.

            :returns: Generator of filenames that are ready to be processed
        """
        self.observer = Observer()
        self.observer.schedule(self, self.monitor_dir)
        self.observer.start()
        print("Starting to watch for new pdfs in %s" % (self.monitor_dir))
        while True:
            logging.info("Sleeping for %d seconds", self.scan_interval)
            time.sleep(self.scan_interval)
            newFile = self.check_queue()
            if newFile:
                yield newFile
        # NOTE: not reached while the loop above has no exit condition.
        self.observer.join()

    def stop(self):
        """
            Stop the observer created by :func:`start`.

            Does nothing if :func:`start` has not been iterated yet, since no
            observer exists at that point.
        """
        if self.observer is not None:
            self.observer.stop()

    def rename_file_with_spaces(self, pdf_filename):
        """
            Rename any portion of a filename that has spaces in the basename with underscores.
            Does not affect spaces in the directory path.

            The file is moved on disk when a rename is needed.

            :param pdf_filename: Filename to remove spaces
            :type pdf_filename: string
            :returns: Modified filename
            :rtype: string
        """
        filepath, filename = os.path.split(pdf_filename)
        if ' ' not in filename:
            return pdf_filename

        newFilename = os.path.join(filepath, filename.replace(' ', '_'))
        logging.debug("Renaming spaces")
        logging.debug("---> %s \n ------> %s", pdf_filename, newFilename)
        shutil.move(pdf_filename, newFilename)
        return newFilename

    def check_for_new_pdf(self, ev_path):
        """
            Called by the file watching api on any file creations/modifications.
            For any file ending with ".pdf", but not "_ocr.pdf" or "_test.pdf", it adds new files
            to the event queue with the current time stamp, or it updates existing files in
            the queue with the current timestamp.  This queue is used to track files and
            keep track of their last "touched" time, so we can start processing a file if
            :func:`check_queue` finds a file that hasn't been touched in a while.

            If the file does not exist in the events dict:

                - Add it with the current time

            Otherwise:

                - If the file time is marked as -1, delete it from the dict
                - Else, update the time in the dict to the current time

            :param ev_path: Path reported by the file system event
            :type ev_path: string
        """
        if not ev_path.endswith(".pdf"):
            return
        if ev_path.endswith(("_ocr.pdf", "_test.pdf")):
            return

        # ``with`` guarantees the lock is released even if something below raises.
        with PyPdfWatcher.events_lock:
            if ev_path not in PyPdfWatcher.events:
                PyPdfWatcher.events[ev_path] = time.time()
                logging.info("Adding %s to event queue", ev_path)
            elif PyPdfWatcher.events[ev_path] == -1:
                # Already handed off by check_queue; this event just purges it.
                logging.info("%s removing from event queue", ev_path)
                del PyPdfWatcher.events[ev_path]
            else:
                newTime = time.time()
                logging.debug("%s already in event queue, updating timestamp to %d", ev_path, newTime)
                PyPdfWatcher.events[ev_path] = newTime

    def on_created(self, event):
        """Queue the created path if it is a pdf of interest."""
        logging.debug("on_created: %s at time %d", event.src_path, time.time())
        self.check_for_new_pdf(event.src_path)

    def on_moved(self, event):
        """Queue the destination of a move if it is a pdf of interest."""
        logging.debug("on_moved: %s", event.src_path)
        self.check_for_new_pdf(event.dest_path)

    def on_modified(self, event):
        """Refresh the queue timestamp of a modified pdf of interest."""
        logging.debug("on_modified: %s", event.src_path)
        self.check_for_new_pdf(event.src_path)

    def check_queue(self):
        """
            This function is called at regular intervals by :func:`start`.

            Iterate through the events, and if there is any with a timestamp
            older than the scan_interval, return it and set its timestamp to -1
            for purging later.  Entries already marked -1 are purged on the way.

            At most one file is returned per call.

            :returns: Filename if available to process, otherwise None.
        """
        now = time.time()
        # ``with`` releases the lock on every exit path, including a failed
        # rename, so a single error cannot block all later event callbacks.
        with PyPdfWatcher.events_lock:
            # Iterate over a snapshot: entries are deleted inside the loop, and
            # mutating a dict while iterating its live view raises RuntimeError.
            for monitored_file, timestamp in list(PyPdfWatcher.events.items()):
                if timestamp == -1:
                    del PyPdfWatcher.events[monitored_file]
                elif now - timestamp > self.scan_interval:
                    logging.info("Processing new file %s", monitored_file)
                    # Remove this file from the dict
                    del PyPdfWatcher.events[monitored_file]
                    monitored_file = self.rename_file_with_spaces(monitored_file)
                    # Add back into queue and mark as not needing further action in the event handler
                    PyPdfWatcher.events[monitored_file] = -1
                    return monitored_file
        return None