root/branches/as-branch/src/server/pvfs2-server.h @ 7851

Revision 7851, 19.8 KB (checked in by sson, 4 years ago)

Fixed a bug in bcast.sm.

Line 
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 *  Declarations for use in the PVFS2 server.
9 */
10
11#ifndef __PVFS2_SERVER_H
12#define __PVFS2_SERVER_H
13
14/* NOTE: STATE-MACHINE.H IS INCLUDED AT THE BOTTOM!  THIS IS SO WE CAN
15 * DEFINE ALL THE STRUCTURES WE NEED BEFORE WE INCLUDE IT.
16 */
17
18#include <stdint.h>
19#include <sys/types.h>
20#include <pwd.h>
21#include <grp.h>
22#include <string.h>
23#include "pvfs2-debug.h"
24#include "pvfs2-storage.h"
25#include "pvfs2-internal.h"
26#include "job.h"
27#include "bmi.h"
28#include "trove.h"
29#include "gossip.h"
30#include "PINT-reqproto-encode.h"
31#include "msgpairarray.h"
32#include "pvfs2-req-proto.h"
33#include "state-machine.h"
34#include "pint-event.h"
35#include "pint-segpool.h"
36
37extern job_context_id server_job_context;
38
#define PVFS2_SERVER_DEFAULT_TIMEOUT_MS      100
#define BMI_UNEXPECTED_OP                    999

/* BMI operation timeout if not specified in config file */
#define PVFS2_SERVER_JOB_BMI_TIMEOUT_DEFAULT         30
/* Flow operation timeout if not specified in config file */
#define PVFS2_SERVER_JOB_FLOW_TIMEOUT_DEFAULT        30
/* BMI client side operation timeout if not specified in config file */
/* NOTE: the default for this timeout is set higher to allow the client to
 * overcome syncing and queueing delays on the server
 */
#define PVFS2_CLIENT_JOB_BMI_TIMEOUT_DEFAULT         300
/* Flow client side operation timeout if not specified in config file */
#define PVFS2_CLIENT_JOB_FLOW_TIMEOUT_DEFAULT        300
/* maximum number of times for client to retry restartable operations;
 * use INT_MAX to approximate infinity (187 years with 2 sec delay)
 */
#define PVFS2_CLIENT_RETRY_LIMIT_DEFAULT     (5)
/* number of milliseconds that clients will delay between retries */
#define PVFS2_CLIENT_RETRY_DELAY_MS_DEFAULT  2000

/* Specifies the number of handles to be precreated at a time from each
 * server using the batch create request.
 */
#define PVFS2_PRECREATE_BATCH_SIZE_DEFAULT 512
/* precreate pools will be topped off if they fall below this value */
#define PVFS2_PRECREATE_LOW_THRESHOLD_DEFAULT 256
66
/* types of permission checking that a server may need to perform for
 * incoming requests
 */
enum PINT_server_req_permissions
{
    PINT_SERVER_CHECK_INVALID = 0, /* invalid request */
    PINT_SERVER_CHECK_WRITE = 1,   /* needs write permission */
    PINT_SERVER_CHECK_READ = 2,    /* needs read permission */
    PINT_SERVER_CHECK_NONE = 3,    /* needs no permission */
    PINT_SERVER_CHECK_ATTR = 4,    /* special case for attribute operations;
                                      needs ownership */
    PINT_SERVER_CHECK_CRDIRENT = 5 /* special case for crdirent operations;
                                      needs write and execute */
};
81
/* PINT_GET_OBJECT_REF_DEFINE(req_name)
 *
 * Generates a static inline function named PINT_get_object_ref_<req_name>
 * that copies req->u.<req_name>.fs_id and req->u.<req_name>.handle into the
 * caller-supplied output pointers and returns 0.  The generated function has
 * the same signature as the get_object_ref callback in
 * struct PINT_server_req_params.
 */
#define PINT_GET_OBJECT_REF_DEFINE(req_name)                             \
static inline int PINT_get_object_ref_##req_name(                        \
    struct PVFS_server_req *req, PVFS_fs_id *fs_id, PVFS_handle *handle) \
{                                                                        \
    *fs_id = req->u.req_name.fs_id;                                      \
    *handle = req->u.req_name.handle;                                    \
    return 0;                                                            \
}
90
/* Default access_type callbacks for requests that always require the same
 * access type (see the access_type member of struct PINT_server_req_params).
 */
enum PINT_server_req_access_type PINT_server_req_readonly(
                                    struct PVFS_server_req *req);
enum PINT_server_req_access_type PINT_server_req_modify(
                                    struct PVFS_server_req *req);
95
96struct PINT_server_req_params
97{
98    const char* string_name;
99
100    /* For each request that specifies an object ref (fsid,handle) we
101     * get the common attributes on that object and check the permissions.
102     * For the request to proceed the permissions required by this flag
103     * must be met.
104     */
105    enum PINT_server_req_permissions perm;
106
107    /* Specifies the type of access on the object (readonly, modify).  This
108     * is used by the request scheduler to determine
109     * which requests to queue (block), and which to schedule (proceed).
110     * This is a callback implemented by the request.  For example, sometimes
111     * the io request writes, sometimes it reads.
112     * Default functions PINT_server_req_readonly and PINT_server_req_modify
113     * are used for requests that always require the same access type.
114     */
115    enum PINT_server_req_access_type (*access_type)(
116                                        struct PVFS_server_req *req);
117
118    /* Specifies the scheduling policy for the request.  In some cases,
119     * we can bypass the request scheduler and proceed directly with the
120     * request.
121     */
122    enum PINT_server_sched_policy sched_policy;
123
124    /* A callback implemented by the request to return the object reference
125     * from the server request structure.
126     */
127    int (*get_object_ref)(
128        struct PVFS_server_req *req, PVFS_fs_id *fs_id, PVFS_handle *handle);
129
130    /* The state machine that performs the request */
131    struct PINT_state_machine_s *state_machine;
132};
133
134struct PINT_server_req_entry
135{
136    enum PVFS_server_op op_type;
137    struct PINT_server_req_params *params;
138};
139
140extern struct PINT_server_req_entry PINT_server_req_table[];
141
142int PINT_server_req_get_object_ref(
143    struct PVFS_server_req *req, PVFS_fs_id *fs_id, PVFS_handle *handle);
144
145enum PINT_server_req_permissions
146PINT_server_req_get_perms(struct PVFS_server_req *req);
147enum PINT_server_req_access_type
148PINT_server_req_get_access_type(struct PVFS_server_req *req);
149enum PINT_server_sched_policy
150PINT_server_req_get_sched_policy(struct PVFS_server_req *req);
151
152const char* PINT_map_server_op_to_string(enum PVFS_server_op op);
153
/* used to keep a random, but handy, list of keys around */
typedef struct PINT_server_trove_keys
{
    char *key;  /* key string */
    int size;   /* size associated with key (NOTE(review): whether this
                 * counts a terminating NUL is not visible here) */
} PINT_server_trove_keys_s;

/* Array of common keys, defined elsewhere; the enums that follow in this
 * header provide named indices. */
extern PINT_server_trove_keys_s Trove_Common_Keys[];
/* Reserved keys */
enum
{
    ROOT_HANDLE_KEY      = 0,
    DIR_ENT_KEY          = 1,
    METAFILE_HANDLES_KEY = 2,
    METAFILE_DIST_KEY    = 3,
    SYMLINK_TARGET_KEY   = 4,
    METAFILE_LAYOUT_KEY  = 5,
    NUM_DFILES_REQ_KEY   = 6
};
173
/* optional; user-settable keys
 *
 * NUM_SPECIAL_KEYS is a count, not an index, which is why it shares the
 * value 3 with METAFILE_HINT_KEY.
 */
enum
{
    DIST_NAME_KEY        = 0,
    DIST_PARAMS_KEY      = 1,
    NUM_DFILES_KEY       = 2,
    NUM_SPECIAL_KEYS     = 3, /* not an index */
    METAFILE_HINT_KEY    = 3,
};
183
/* Server status bit flags.  Apart from SERVER_DEFAULT_INIT (0) every value
 * is a distinct single bit, so flags can be combined with bitwise OR.
 * NOTE(review): the names suggest one flag per initialized subsystem --
 * confirm against the server startup/shutdown code.
 */
typedef enum
{
    SERVER_DEFAULT_INIT        = 0,
    SERVER_GOSSIP_INIT         = (1 << 0),
    SERVER_CONFIG_INIT         = (1 << 1),
    SERVER_ENCODER_INIT        = (1 << 2),
    SERVER_BMI_INIT            = (1 << 3),
    SERVER_TROVE_INIT          = (1 << 4),
    SERVER_FLOW_INIT           = (1 << 5),
    SERVER_JOB_INIT            = (1 << 6),
    SERVER_JOB_CTX_INIT        = (1 << 7),
    SERVER_REQ_SCHED_INIT      = (1 << 8),
    SERVER_STATE_MACHINE_INIT  = (1 << 9),
    SERVER_BMI_UNEXP_POST_INIT = (1 << 10),
    SERVER_SIGNAL_HANDLER_INIT = (1 << 11),
    SERVER_JOB_OBJS_ALLOCATED  = (1 << 12),
    SERVER_PERF_COUNTER_INIT   = (1 << 13),
    SERVER_EVENT_INIT          = (1 << 14),
    SERVER_JOB_TIME_MGR_INIT   = (1 << 15),
    SERVER_DIST_INIT           = (1 << 16),
    SERVER_CACHED_CONFIG_INIT  = (1 << 17),
    SERVER_PRECREATE_INIT      = (1 << 18),
} PINT_server_status_flag;
207
208struct PINT_server_create_op
209{
210    const char **io_servers;
211    const char **remote_io_servers;
212    int num_io_servers;
213    PVFS_handle* handle_array_local;
214    PVFS_handle* handle_array_remote;
215    int handle_array_local_count;
216    int handle_array_remote_count;
217    PVFS_error saved_error_code;
218    int handle_index;
219};
220
221/* struct PINT_server_lookup_op
222 *
223 * All the data needed during lookup processing:
224 *
225 */
226struct PINT_server_lookup_op
227{
228    /* current segment (0..N), number of segments in the path */
229    int seg_ct, seg_nr;
230
231    /* number of attrs read succesfully */
232    int attr_ct;
233
234    /* number of handles read successfully */
235    int handle_ct;
236
237    char *segp;
238    void *segstate;
239
240    PVFS_handle dirent_handle;
241    PVFS_ds_attributes *ds_attr_array;
242};
243
244struct PINT_server_readdir_op
245{
246    uint64_t directory_version;
247    PVFS_handle dirent_handle;  /* holds handle of dirdata dspace from
248                                   which entries are read */
249    PVFS_size dirdata_size;
250};
251
252struct PINT_server_crdirent_op
253{
254    char *name;
255    PVFS_handle new_handle;
256    PVFS_handle parent_handle;
257    PVFS_fs_id fs_id;
258    PVFS_handle dirent_handle;  /* holds handle of dirdata dspace that
259                                 * we'll write the dirent into */
260    PVFS_size dirent_count;
261    int dir_attr_update_required;
262};
263
264struct PINT_server_rmdirent_op
265{
266    PVFS_handle dirdata_handle;
267    PVFS_handle entry_handle; /* holds handle of dirdata object,
268                               * removed entry */
269    PVFS_size dirent_count;
270    int dir_attr_update_required;
271};
272
273struct PINT_server_chdirent_op
274{
275    PVFS_handle dirdata_handle;
276    PVFS_handle old_dirent_handle;
277    PVFS_handle new_dirent_handle;
278    int dir_attr_update_required;
279};
280
281struct PINT_server_remove_op
282{
283    PVFS_handle handle;
284    PVFS_fs_id fs_id;
285    PVFS_handle dirdata_handle;   /* holds dirdata dspace handle in
286                                   * the event that we are removing a
287                                   * directory */
288    PVFS_size dirent_count;
289    PVFS_ds_keyval key;
290    PVFS_ds_position pos;
291    int key_count;
292    int index;
293    int remove_keyvals_state;
294};
295
296struct PINT_server_mgmt_remove_dirent_op
297{
298    PVFS_handle dirdata_handle;
299};
300
301struct PINT_server_precreate_pool_refiller_op
302{
303    PVFS_handle pool_handle;
304    PVFS_handle* precreate_handle_array;
305    PVFS_fs_id fsid;
306    char* host;
307    PVFS_BMI_addr_t host_addr;
308    PVFS_handle_extent_array data_handle_extent_array;
309};
310
/* Scratch space for the batch_create request. */
struct PINT_server_batch_create_op
{
    int saved_error_code;
    int batch_index;
};
316
/* Scratch space for the batch_remove request. */
struct PINT_server_batch_remove_op
{
    int handle_index;
    int error_code;
};
322
323struct PINT_server_mgmt_get_dirdata_op
324{
325    PVFS_handle dirdata_handle;
326};
327
/* Scratch space for the getconfig request. */
struct PINT_server_getconfig_op
{
    int strsize; /* used to hold string lengths during getconfig
                  * processing */
};
333
334struct PINT_server_io_op
335{
336    gen_mutex_t mutex;
337    PINT_segpool_handle_t seg_handle;
338    PINT_Request *file_req;
339    PVFS_offset file_req_offset;
340    PINT_Request *mem_req;
341   
342    void *user_ptr;
343
344    void *tmp_buffer;
345    PVFS_size count; /* for MEAN operation */
346
347    PVFS_size aggregate_size;
348
349    PINT_request_file_data file_data;
350 
351    int buffer_size;
352    int num_of_buffers;
353   
354    PVFS_size total_transferred;
355
356    int parallel_sms;
357};
358
359/* substibute for flow */
360struct PINT_server_pipeline_op
361{
362    PVFS_fs_id fs_id;
363    PVFS_handle handle;
364    PVFS_BMI_addr_t address;
365
366    PVFS_handle *dfile_array;
367    int dfile_index; /* can be used for Rank */
368    int dfile_count;
369    struct PINT_dist_s *dist;
370    PINT_request_file_data file_data;
371
372    PINT_Request *file_req;
373    PVFS_offset file_req_offset;
374    PINT_Request *mem_req;
375
376    /* for strip alignment */
377    char tmp_buf[128]; /* FIXME */
378    PVFS_size unaligned_size;
379
380    enum PVFS_io_type io_type;
381
382    /* AS: operator and data type */
383    int op;
384    int datatype;
385
386    char *buffer;
387    PVFS_size buffer_size;
388    PVFS_size buffer_used;
389    PVFS_size out_size;
390    PINT_segpool_handle_t seg_handle;
391    PINT_segpool_unit_id id;
392    PVFS_offset *offsets;
393    PVFS_size *sizes;
394    int segs;
395    PVFS_hint hints;
396    PVFS_msg_tag_t tag;
397    int trove_sync_flag;
398    PVFS_offset loff;
399    int parallel_sms;
400};
401 
402/* allreduce */
403struct PINT_server_allreduce_op
404{
405    int type; /* SEND (0) or RECV (1) */ /* FIXME: not used anymore? */
406    int op; /* {SUM, MAX, MIN} = {0, 1, 2} */
407    int datatype; /* MPI_INT, MPI_FLOAT, MPI_DOUBLE */
408    PVFS_fs_id fs_id;
409    PVFS_hint hints;
410    PVFS_handle *dfile_array;
411    int myRank;
412    int tree_depth;
413    int current_depth;
414    void *send_buf;
415    void *recv_buf;
416    PVFS_size buf_sz;
417    int mask;
418};
419
/* send_recv */
struct PINT_server_send_recv_op
{
    int type;
    int myRank;
    int mask;
};
427
428/* bcast */
429struct PINT_server_bcast_op
430{
431    int type; /* SEND (0) or RECV (1) */
432    int datatype; /* MPI_INT, MPI_FLOAT, MPI_DOUBLE */
433    PVFS_fs_id fs_id;
434    PVFS_hint hints;
435    PVFS_handle *dfile_array;
436    int dfile_count;
437    int myRank;
438    int tree_depth;
439    int index;
440    void *send_buf;
441    void *recv_buf;
442    PVFS_size buf_sz;
443    int mask;
444};
445
446
447struct PINT_server_kmeans_op
448{
449    int myRank;
450    PVFS_handle *dfile_array;
451    int dfile_count;
452    PVFS_fs_id fs_id;
453
454    int loop;
455    int numClusters;
456    int numCoords;
457    int numObjs; /* local number of objs */
458    int totalNumObjs;
459    int *newClusterSize;
460    int *clusterSize;
461    float delta;
462    float delta_tmp;
463    float **newClusters;
464    int *membership; /* [numObjs] */
465    float **objects; /* [numObjs][numCoords] data objects */
466    float **clusters; /* [numClusters][numCoords] cluster center */
467    float threshold;
468    int allreduce_step;
469};
470
471struct PINT_server_small_io_op
472{
473    PVFS_offset offsets[IO_MAX_REGIONS];
474    PVFS_size sizes[IO_MAX_REGIONS];
475    PVFS_size result_bytes;
476};
477
478struct PINT_server_flush_op
479{
480    PVFS_handle handle;     /* handle of data we want to flush to disk */
481    int flags;              /* any special flags for flush */
482};
483
484struct PINT_server_truncate_op
485{
486    PVFS_handle handle;     /* handle of datafile we resize */
487    PVFS_offset size;       /* new size of datafile */
488};
489
490struct PINT_server_mkdir_op
491{
492    PVFS_fs_id fs_id;
493    PVFS_handle_extent_array handle_extent_array;
494    PVFS_handle dirent_handle;
495    PVFS_size init_dirdata_size;
496};
497
498struct PINT_server_getattr_op
499{
500    PVFS_handle handle;
501    PVFS_fs_id fs_id;
502    PVFS_ds_attributes dirdata_ds_attr;
503    uint32_t attrmask;
504    PVFS_error* err_array;
505    PVFS_ds_keyval_handle_info keyval_handle_info;
506    PVFS_handle dirent_handle;
507    int num_dfiles_req;
508};
509
510struct PINT_server_listattr_op
511{
512    PVFS_object_attr *attr_a;
513    PVFS_ds_attributes *ds_attr_a;
514    PVFS_error *errors;
515    int parallel_sms;
516};
517
/* this is shared by set_eattr, get_eattr and list_eattr */
struct PINT_server_eattr_op
{
    void *buffer;
};
523
524struct PINT_server_unstuff_op
525{
526    PVFS_handle* dfile_array;
527    int num_dfiles_req;
528    PVFS_sys_layout layout;
529    void* encoded_layout;
530};
531
532/* This structure is passed into the void *ptr
533 * within the job interface.  Used to tell us where
534 * to go next in our state machine.
535 */
536typedef struct PINT_server_op
537{
538    struct qlist_head   next; /* used to queue structures used for unexp style messages */
539    int op_cancelled; /* indicates unexp message was cancelled */
540    job_id_t unexp_id;
541
542    enum PVFS_server_op op;  /* type of operation that we are servicing */
543
544    PINT_event_id event_id;
545
546    /* holds id from request scheduler so we can release it later */
547    job_id_t scheduled_id;
548
549    /* generic structures used in most server operations */
550    PVFS_ds_keyval key, val;
551    PVFS_ds_keyval *key_a;
552    PVFS_ds_keyval *val_a;
553    int *error_a;
554    int keyval_count;
555
556    int free_val;
557
558    /* attributes structure associated with target of operation; may be
559     * partially filled in by prelude nested state machine (for
560     * permission checking); may be used/modified by later states as well
561     *
562     * the ds_attr is used by the prelude sm only (and for pulling the
563     * size out in the get-attr server sm); don't use it otherwise --
564     * the object_attr is prepared for other sm's, so use it instead.
565     */
566    PVFS_ds_attributes ds_attr;
567    PVFS_object_attr attr;
568
569    PVFS_BMI_addr_t addr;   /* address of client that contacted us */
570    bmi_msg_tag_t tag; /* operation tag */
571    /* information about unexpected message that initiated this operation */
572    struct BMI_unexpected_info unexp_bmi_buff;
573
574    /* decoded request and response structures */
575    struct PVFS_server_req *req;
576    struct PVFS_server_resp resp;
577    /* encoded request and response structures */
578    struct PINT_encoded_msg encoded;
579    struct PINT_decoded_msg decoded;
580
581    PINT_sm_msgarray_op msgarray_op;
582
583    PVFS_handle target_handle;
584    PVFS_fs_id target_fs_id;
585    PVFS_object_attr *target_object_attr;
586
587    enum PINT_server_req_access_type access_type;
588    enum PINT_server_sched_policy sched_policy;
589
590    union
591    {
592        /* request-specific scratch spaces for use during processing */
593        struct PINT_server_create_op create;
594        struct PINT_server_eattr_op eattr;
595        struct PINT_server_getattr_op getattr;
596        struct PINT_server_listattr_op listattr;
597        struct PINT_server_getconfig_op getconfig;
598        struct PINT_server_lookup_op lookup;
599        struct PINT_server_crdirent_op crdirent;
600        struct PINT_server_readdir_op readdir;
601        struct PINT_server_remove_op remove;
602        struct PINT_server_chdirent_op chdirent;
603        struct PINT_server_rmdirent_op rmdirent;
604        struct PINT_server_io_op io;
605        struct PINT_server_small_io_op small_io;
606        struct PINT_server_pipeline_op pipeline;
607        struct PINT_server_allreduce_op allreduce;
608        struct PINT_server_send_recv_op send_recv;
609        struct PINT_server_bcast_op bcast;
610        struct PINT_server_kmeans_op kmeans;
611        struct PINT_server_flush_op flush;
612        struct PINT_server_truncate_op truncate;
613        struct PINT_server_mkdir_op mkdir;
614        struct PINT_server_mgmt_remove_dirent_op mgmt_remove_dirent;
615        struct PINT_server_mgmt_get_dirdata_op mgmt_get_dirdata_handle;
616        struct PINT_server_precreate_pool_refiller_op
617            precreate_pool_refiller;
618        struct PINT_server_batch_create_op batch_create;
619        struct PINT_server_batch_remove_op batch_remove;
620        struct PINT_server_unstuff_op unstuff;
621    } u;
622
623} PINT_server_op;
624
625/* PINT_ACCESS_DEBUG()
626 *
627 * macro for consistent printing of access records
628 *
629 * no return value
630 */
631#ifdef GOSSIP_DISABLE_DEBUG
632#define PINT_ACCESS_DEBUG(__s_op, __mask, format, f...) do {} while (0)
633#else
634#define PINT_ACCESS_DEBUG(__s_op, __mask, format, f...)                     \
635    PINT_server_access_debug(__s_op, __mask, format, ##f)
636#endif
637
638void PINT_server_access_debug(PINT_server_op * s_op,
639                              int64_t debug_mask,
640                              const char * format,
641                              ...) __attribute__((format(printf, 3, 4)));
642
/* nested state machines */
extern struct PINT_state_machine_s pvfs2_get_attr_work_sm;
extern struct PINT_state_machine_s pvfs2_prelude_sm;
extern struct PINT_state_machine_s pvfs2_prelude_work_sm;
extern struct PINT_state_machine_s pvfs2_final_response_sm;
extern struct PINT_state_machine_s pvfs2_check_entry_not_exist_sm;
extern struct PINT_state_machine_s pvfs2_remove_work_sm;
extern struct PINT_state_machine_s pvfs2_mkdir_work_sm;
extern struct PINT_state_machine_s pvfs2_unexpected_sm;
extern struct PINT_state_machine_s pvfs2_pipeline_sm; /* sson */
extern struct PINT_state_machine_s pvfs2_allreduce_sm; /* sson */
extern struct PINT_state_machine_s pvfs2_bcast_sm; /* sson */
extern struct PINT_state_machine_s pvfs2_kmeans_sm; /* sson */
extern struct PINT_state_machine_s pvfs2_send_recv_sm; /* sson */
657
658/* Exported Prototypes */
659struct server_configuration_s *get_server_config_struct(void);
660
661/* exported state machine resource reclamation function */
662int server_post_unexpected_recv(job_status_s *js_p);
663int server_state_machine_start( PINT_smcb *smcb, job_status_s *js_p);
664int server_state_machine_complete(PINT_smcb *smcb);
665int server_state_machine_terminate(PINT_smcb *smcb, job_status_s *js_p);
666
667/* lists of server ops */
668extern struct qlist_head posted_sop_list;
669extern struct qlist_head inprogress_sop_list;
670
671/* starts state machines not associated with an incoming request */
672int server_state_machine_alloc_noreq(
673    enum PVFS_server_op op, struct PINT_smcb ** new_op);
674int server_state_machine_start_noreq(
675    struct PINT_smcb *new_op);
676
/* INCLUDE STATE-MACHINE.H DOWN HERE */
/* The macros below are compiled out (#if 0) and kept for reference only.
 * The member name matches struct PINT_server_req_params (state_machine) and
 * the macro argument is parenthesised so the block would compile if it were
 * ever re-enabled.
 */
#if 0
#define PINT_OP_STATE       PINT_server_op
#define PINT_OP_STATE_GET_MACHINE(_op) \
    (((_op) >= 0 && (_op) < PVFS_SERV_NUM_OPS) ? \
    PINT_server_req_table[(_op)].params->state_machine : NULL)
#endif
684
685#include "pvfs2-internal.h"
686
/* Returns the state machine for an operation, selected by an int argument
 * (NOTE(review): presumably a PVFS_server_op code -- the parameter is
 * unnamed here, confirm against the definition). */
struct PINT_state_machine_s *server_op_state_get_machine(int);
688
689#endif /* __PVFS2_SERVER_H */
690
691/*
692 * Local variables:
693 *  c-indent-level: 4
694 *  c-basic-offset: 4
695 * End:
696 *
697 * vim: ts=8 sts=4 sw=4 expandtab
698 */
Note: See TracBrowser for help on using the browser.